TovaHasi committed on
Commit
4e707bc
1 Parent(s): 9811406

Upload Tokenization.ipynb

Browse files
Files changed (1) hide show
  1. Tokenization.ipynb +1356 -0
Tokenization.ipynb ADDED
@@ -0,0 +1,1356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 26,
6
+ "id": "90e02bcf",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from sklearn.preprocessing import LabelEncoder\n",
11
+ "import transformers\n",
12
+ "import torch\n",
13
+ "import nltk\n",
14
+ "import numpy as np\n",
15
+ "import pandas as pd"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 27,
21
+ "id": "285335e8",
22
+ "metadata": {},
23
+ "outputs": [
24
+ {
25
+ "name": "stderr",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "Using custom data configuration default-b7be9a9d71194d1a\n"
29
+ ]
30
+ },
31
+ {
32
+ "name": "stdout",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "Downloading and preparing dataset csv/default to /Users/fixed/.cache/huggingface/datasets/csv/default-b7be9a9d71194d1a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...\n"
36
+ ]
37
+ },
38
+ {
39
+ "data": {
40
+ "application/vnd.jupyter.widget-view+json": {
41
+ "model_id": "88b1a7629b6345709dbd1adb4486ec58",
42
+ "version_major": 2,
43
+ "version_minor": 0
44
+ },
45
+ "text/plain": [
46
+ "Downloading data files: 0%| | 0/3 [00:00<?, ?it/s]"
47
+ ]
48
+ },
49
+ "metadata": {},
50
+ "output_type": "display_data"
51
+ },
52
+ {
53
+ "data": {
54
+ "application/vnd.jupyter.widget-view+json": {
55
+ "model_id": "fcf84fc7b8af491b8ee7217d258bc754",
56
+ "version_major": 2,
57
+ "version_minor": 0
58
+ },
59
+ "text/plain": [
60
+ "Extracting data files: 0%| | 0/3 [00:00<?, ?it/s]"
61
+ ]
62
+ },
63
+ "metadata": {},
64
+ "output_type": "display_data"
65
+ },
66
+ {
67
+ "name": "stdout",
68
+ "output_type": "stream",
69
+ "text": [
70
+ "Dataset csv downloaded and prepared to /Users/fixed/.cache/huggingface/datasets/csv/default-b7be9a9d71194d1a/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.\n"
71
+ ]
72
+ },
73
+ {
74
+ "data": {
75
+ "application/vnd.jupyter.widget-view+json": {
76
+ "model_id": "c47a15fa466f4b71ada94b7c317be40a",
77
+ "version_major": 2,
78
+ "version_minor": 0
79
+ },
80
+ "text/plain": [
81
+ " 0%| | 0/3 [00:00<?, ?it/s]"
82
+ ]
83
+ },
84
+ "metadata": {},
85
+ "output_type": "display_data"
86
+ }
87
+ ],
88
+ "source": [
89
+ "from datasets import load_dataset\n",
90
+ "dataset_train_test = load_dataset('csv', \n",
91
+ " data_files={'train': 'train_data.csv', 'test': 'test_data.csv', 'val': 'val_data.csv'})"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 28,
97
+ "id": "9a097810",
98
+ "metadata": {},
99
+ "outputs": [
100
+ {
101
+ "data": {
102
+ "text/plain": [
103
+ "DatasetDict({\n",
104
+ " train: Dataset({\n",
105
+ " features: ['text', 'label'],\n",
106
+ " num_rows: 44928\n",
107
+ " })\n",
108
+ " test: Dataset({\n",
109
+ " features: ['text', 'label'],\n",
110
+ " num_rows: 11981\n",
111
+ " })\n",
112
+ " val: Dataset({\n",
113
+ " features: ['text', 'label'],\n",
114
+ " num_rows: 14976\n",
115
+ " })\n",
116
+ "})"
117
+ ]
118
+ },
119
+ "execution_count": 28,
120
+ "metadata": {},
121
+ "output_type": "execute_result"
122
+ }
123
+ ],
124
+ "source": [
125
+ "dataset_train_test"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 29,
131
+ "id": "9d9f837b",
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "application/vnd.jupyter.widget-view+json": {
137
+ "model_id": "77c3bc469e924d90b410c9a053b3d3fd",
138
+ "version_major": 2,
139
+ "version_minor": 0
140
+ },
141
+ "text/plain": [
142
+ " 0%| | 0/45 [00:00<?, ?ba/s]"
143
+ ]
144
+ },
145
+ "metadata": {},
146
+ "output_type": "display_data"
147
+ },
148
+ {
149
+ "data": {
150
+ "application/vnd.jupyter.widget-view+json": {
151
+ "model_id": "8c94567c41c54910bcbd02f33078f057",
152
+ "version_major": 2,
153
+ "version_minor": 0
154
+ },
155
+ "text/plain": [
156
+ " 0%| | 0/12 [00:00<?, ?ba/s]"
157
+ ]
158
+ },
159
+ "metadata": {},
160
+ "output_type": "display_data"
161
+ },
162
+ {
163
+ "data": {
164
+ "application/vnd.jupyter.widget-view+json": {
165
+ "model_id": "642f18f8d18e4176b114420edfa959b9",
166
+ "version_major": 2,
167
+ "version_minor": 0
168
+ },
169
+ "text/plain": [
170
+ " 0%| | 0/15 [00:00<?, ?ba/s]"
171
+ ]
172
+ },
173
+ "metadata": {},
174
+ "output_type": "display_data"
175
+ }
176
+ ],
177
+ "source": [
178
+ "from transformers import AutoTokenizer\n",
179
+ "\n",
180
+ "tokenizer = AutoTokenizer.from_pretrained(\"distilbert-base-uncased\")\n",
181
+ "\n",
182
+ "def tokenize_function(examples):\n",
183
+ " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
184
+ "\n",
185
+ "\n",
186
+ "tokenized_datasets = dataset_train_test.map(tokenize_function, batched=True)"
187
+ ]
188
+ },
189
+ {
190
+ "cell_type": "code",
191
+ "execution_count": 37,
192
+ "id": "73d1be33",
193
+ "metadata": {},
194
+ "outputs": [
195
+ {
196
+ "data": {
197
+ "text/plain": [
198
+ "{'text': 'strategic behaviour and indicative price diffusion in paris stock exchange auctions. we report statistical regularities of the opening and closing auctions of french equities, focusing on the diffusive properties of the indicative auction price. two mechanisms are at play as the auction end time nears: the typical price change magnitude decreases, favoring underdiffusion, while the rate of these events increases, potentially leading to overdiffusion. a third mechanism, caused by the strategic behavior of traders, is needed to produce nearly diffusive prices: waiting to submit buy orders until sell orders have decreased the indicative price and vice-versa. ',\n",
199
+ " 'label': 6,\n",
200
+ " 'input_ids': [101,\n",
201
+ " 6143,\n",
202
+ " 9164,\n",
203
+ " 1998,\n",
204
+ " 24668,\n",
205
+ " 3976,\n",
206
+ " 19241,\n",
207
+ " 1999,\n",
208
+ " 3000,\n",
209
+ " 4518,\n",
210
+ " 3863,\n",
211
+ " 10470,\n",
212
+ " 2015,\n",
213
+ " 1012,\n",
214
+ " 2057,\n",
215
+ " 3189,\n",
216
+ " 7778,\n",
217
+ " 3180,\n",
218
+ " 6447,\n",
219
+ " 1997,\n",
220
+ " 1996,\n",
221
+ " 3098,\n",
222
+ " 1998,\n",
223
+ " 5494,\n",
224
+ " 10470,\n",
225
+ " 2015,\n",
226
+ " 1997,\n",
227
+ " 2413,\n",
228
+ " 1041,\n",
229
+ " 15549,\n",
230
+ " 7368,\n",
231
+ " 1010,\n",
232
+ " 7995,\n",
233
+ " 2006,\n",
234
+ " 1996,\n",
235
+ " 4487,\n",
236
+ " 4246,\n",
237
+ " 2271,\n",
238
+ " 3512,\n",
239
+ " 5144,\n",
240
+ " 1997,\n",
241
+ " 1996,\n",
242
+ " 24668,\n",
243
+ " 10470,\n",
244
+ " 3976,\n",
245
+ " 1012,\n",
246
+ " 2048,\n",
247
+ " 10595,\n",
248
+ " 2024,\n",
249
+ " 2012,\n",
250
+ " 2377,\n",
251
+ " 2004,\n",
252
+ " 1996,\n",
253
+ " 10470,\n",
254
+ " 2203,\n",
255
+ " 2051,\n",
256
+ " 2379,\n",
257
+ " 2015,\n",
258
+ " 1024,\n",
259
+ " 1996,\n",
260
+ " 5171,\n",
261
+ " 3976,\n",
262
+ " 2689,\n",
263
+ " 10194,\n",
264
+ " 17913,\n",
265
+ " 1010,\n",
266
+ " 5684,\n",
267
+ " 2075,\n",
268
+ " 2104,\n",
269
+ " 4305,\n",
270
+ " 4246,\n",
271
+ " 14499,\n",
272
+ " 1010,\n",
273
+ " 2096,\n",
274
+ " 1996,\n",
275
+ " 3446,\n",
276
+ " 1997,\n",
277
+ " 2122,\n",
278
+ " 2824,\n",
279
+ " 7457,\n",
280
+ " 1010,\n",
281
+ " 9280,\n",
282
+ " 2877,\n",
283
+ " 2000,\n",
284
+ " 2058,\n",
285
+ " 4305,\n",
286
+ " 4246,\n",
287
+ " 14499,\n",
288
+ " 1012,\n",
289
+ " 1037,\n",
290
+ " 2353,\n",
291
+ " 7337,\n",
292
+ " 1010,\n",
293
+ " 3303,\n",
294
+ " 2011,\n",
295
+ " 1996,\n",
296
+ " 6143,\n",
297
+ " 5248,\n",
298
+ " 1997,\n",
299
+ " 13066,\n",
300
+ " 1010,\n",
301
+ " 2003,\n",
302
+ " 2734,\n",
303
+ " 2000,\n",
304
+ " 3965,\n",
305
+ " 3053,\n",
306
+ " 4487,\n",
307
+ " 4246,\n",
308
+ " 2271,\n",
309
+ " 3512,\n",
310
+ " 7597,\n",
311
+ " 1024,\n",
312
+ " 3403,\n",
313
+ " 2000,\n",
314
+ " 12040,\n",
315
+ " 4965,\n",
316
+ " 4449,\n",
317
+ " 2127,\n",
318
+ " 5271,\n",
319
+ " 4449,\n",
320
+ " 2031,\n",
321
+ " 10548,\n",
322
+ " 1996,\n",
323
+ " 24668,\n",
324
+ " 3976,\n",
325
+ " 1998,\n",
326
+ " 3580,\n",
327
+ " 1011,\n",
328
+ " 18601,\n",
329
+ " 1012,\n",
330
+ " 102,\n",
331
+ " 0,\n",
332
+ " 0,\n",
333
+ " 0,\n",
334
+ " 0,\n",
335
+ " 0,\n",
336
+ " 0,\n",
337
+ " 0,\n",
338
+ " 0,\n",
339
+ " 0,\n",
340
+ " 0,\n",
341
+ " 0,\n",
342
+ " 0,\n",
343
+ " 0,\n",
344
+ " 0,\n",
345
+ " 0,\n",
346
+ " 0,\n",
347
+ " 0,\n",
348
+ " 0,\n",
349
+ " 0,\n",
350
+ " 0,\n",
351
+ " 0,\n",
352
+ " 0,\n",
353
+ " 0,\n",
354
+ " 0,\n",
355
+ " 0,\n",
356
+ " 0,\n",
357
+ " 0,\n",
358
+ " 0,\n",
359
+ " 0,\n",
360
+ " 0,\n",
361
+ " 0,\n",
362
+ " 0,\n",
363
+ " 0,\n",
364
+ " 0,\n",
365
+ " 0,\n",
366
+ " 0,\n",
367
+ " 0,\n",
368
+ " 0,\n",
369
+ " 0,\n",
370
+ " 0,\n",
371
+ " 0,\n",
372
+ " 0,\n",
373
+ " 0,\n",
374
+ " 0,\n",
375
+ " 0,\n",
376
+ " 0,\n",
377
+ " 0,\n",
378
+ " 0,\n",
379
+ " 0,\n",
380
+ " 0,\n",
381
+ " 0,\n",
382
+ " 0,\n",
383
+ " 0,\n",
384
+ " 0,\n",
385
+ " 0,\n",
386
+ " 0,\n",
387
+ " 0,\n",
388
+ " 0,\n",
389
+ " 0,\n",
390
+ " 0,\n",
391
+ " 0,\n",
392
+ " 0,\n",
393
+ " 0,\n",
394
+ " 0,\n",
395
+ " 0,\n",
396
+ " 0,\n",
397
+ " 0,\n",
398
+ " 0,\n",
399
+ " 0,\n",
400
+ " 0,\n",
401
+ " 0,\n",
402
+ " 0,\n",
403
+ " 0,\n",
404
+ " 0,\n",
405
+ " 0,\n",
406
+ " 0,\n",
407
+ " 0,\n",
408
+ " 0,\n",
409
+ " 0,\n",
410
+ " 0,\n",
411
+ " 0,\n",
412
+ " 0,\n",
413
+ " 0,\n",
414
+ " 0,\n",
415
+ " 0,\n",
416
+ " 0,\n",
417
+ " 0,\n",
418
+ " 0,\n",
419
+ " 0,\n",
420
+ " 0,\n",
421
+ " 0,\n",
422
+ " 0,\n",
423
+ " 0,\n",
424
+ " 0,\n",
425
+ " 0,\n",
426
+ " 0,\n",
427
+ " 0,\n",
428
+ " 0,\n",
429
+ " 0,\n",
430
+ " 0,\n",
431
+ " 0,\n",
432
+ " 0,\n",
433
+ " 0,\n",
434
+ " 0,\n",
435
+ " 0,\n",
436
+ " 0,\n",
437
+ " 0,\n",
438
+ " 0,\n",
439
+ " 0,\n",
440
+ " 0,\n",
441
+ " 0,\n",
442
+ " 0,\n",
443
+ " 0,\n",
444
+ " 0,\n",
445
+ " 0,\n",
446
+ " 0,\n",
447
+ " 0,\n",
448
+ " 0,\n",
449
+ " 0,\n",
450
+ " 0,\n",
451
+ " 0,\n",
452
+ " 0,\n",
453
+ " 0,\n",
454
+ " 0,\n",
455
+ " 0,\n",
456
+ " 0,\n",
457
+ " 0,\n",
458
+ " 0,\n",
459
+ " 0,\n",
460
+ " 0,\n",
461
+ " 0,\n",
462
+ " 0,\n",
463
+ " 0,\n",
464
+ " 0,\n",
465
+ " 0,\n",
466
+ " 0,\n",
467
+ " 0,\n",
468
+ " 0,\n",
469
+ " 0,\n",
470
+ " 0,\n",
471
+ " 0,\n",
472
+ " 0,\n",
473
+ " 0,\n",
474
+ " 0,\n",
475
+ " 0,\n",
476
+ " 0,\n",
477
+ " 0,\n",
478
+ " 0,\n",
479
+ " 0,\n",
480
+ " 0,\n",
481
+ " 0,\n",
482
+ " 0,\n",
483
+ " 0,\n",
484
+ " 0,\n",
485
+ " 0,\n",
486
+ " 0,\n",
487
+ " 0,\n",
488
+ " 0,\n",
489
+ " 0,\n",
490
+ " 0,\n",
491
+ " 0,\n",
492
+ " 0,\n",
493
+ " 0,\n",
494
+ " 0,\n",
495
+ " 0,\n",
496
+ " 0,\n",
497
+ " 0,\n",
498
+ " 0,\n",
499
+ " 0,\n",
500
+ " 0,\n",
501
+ " 0,\n",
502
+ " 0,\n",
503
+ " 0,\n",
504
+ " 0,\n",
505
+ " 0,\n",
506
+ " 0,\n",
507
+ " 0,\n",
508
+ " 0,\n",
509
+ " 0,\n",
510
+ " 0,\n",
511
+ " 0,\n",
512
+ " 0,\n",
513
+ " 0,\n",
514
+ " 0,\n",
515
+ " 0,\n",
516
+ " 0,\n",
517
+ " 0,\n",
518
+ " 0,\n",
519
+ " 0,\n",
520
+ " 0,\n",
521
+ " 0,\n",
522
+ " 0,\n",
523
+ " 0,\n",
524
+ " 0,\n",
525
+ " 0,\n",
526
+ " 0,\n",
527
+ " 0,\n",
528
+ " 0,\n",
529
+ " 0,\n",
530
+ " 0,\n",
531
+ " 0,\n",
532
+ " 0,\n",
533
+ " 0,\n",
534
+ " 0,\n",
535
+ " 0,\n",
536
+ " 0,\n",
537
+ " 0,\n",
538
+ " 0,\n",
539
+ " 0,\n",
540
+ " 0,\n",
541
+ " 0,\n",
542
+ " 0,\n",
543
+ " 0,\n",
544
+ " 0,\n",
545
+ " 0,\n",
546
+ " 0,\n",
547
+ " 0,\n",
548
+ " 0,\n",
549
+ " 0,\n",
550
+ " 0,\n",
551
+ " 0,\n",
552
+ " 0,\n",
553
+ " 0,\n",
554
+ " 0,\n",
555
+ " 0,\n",
556
+ " 0,\n",
557
+ " 0,\n",
558
+ " 0,\n",
559
+ " 0,\n",
560
+ " 0,\n",
561
+ " 0,\n",
562
+ " 0,\n",
563
+ " 0,\n",
564
+ " 0,\n",
565
+ " 0,\n",
566
+ " 0,\n",
567
+ " 0,\n",
568
+ " 0,\n",
569
+ " 0,\n",
570
+ " 0,\n",
571
+ " 0,\n",
572
+ " 0,\n",
573
+ " 0,\n",
574
+ " 0,\n",
575
+ " 0,\n",
576
+ " 0,\n",
577
+ " 0,\n",
578
+ " 0,\n",
579
+ " 0,\n",
580
+ " 0,\n",
581
+ " 0,\n",
582
+ " 0,\n",
583
+ " 0,\n",
584
+ " 0,\n",
585
+ " 0,\n",
586
+ " 0,\n",
587
+ " 0,\n",
588
+ " 0,\n",
589
+ " 0,\n",
590
+ " 0,\n",
591
+ " 0,\n",
592
+ " 0,\n",
593
+ " 0,\n",
594
+ " 0,\n",
595
+ " 0,\n",
596
+ " 0,\n",
597
+ " 0,\n",
598
+ " 0,\n",
599
+ " 0,\n",
600
+ " 0,\n",
601
+ " 0,\n",
602
+ " 0,\n",
603
+ " 0,\n",
604
+ " 0,\n",
605
+ " 0,\n",
606
+ " 0,\n",
607
+ " 0,\n",
608
+ " 0,\n",
609
+ " 0,\n",
610
+ " 0,\n",
611
+ " 0,\n",
612
+ " 0,\n",
613
+ " 0,\n",
614
+ " 0,\n",
615
+ " 0,\n",
616
+ " 0,\n",
617
+ " 0,\n",
618
+ " 0,\n",
619
+ " 0,\n",
620
+ " 0,\n",
621
+ " 0,\n",
622
+ " 0,\n",
623
+ " 0,\n",
624
+ " 0,\n",
625
+ " 0,\n",
626
+ " 0,\n",
627
+ " 0,\n",
628
+ " 0,\n",
629
+ " 0,\n",
630
+ " 0,\n",
631
+ " 0,\n",
632
+ " 0,\n",
633
+ " 0,\n",
634
+ " 0,\n",
635
+ " 0,\n",
636
+ " 0,\n",
637
+ " 0,\n",
638
+ " 0,\n",
639
+ " 0,\n",
640
+ " 0,\n",
641
+ " 0,\n",
642
+ " 0,\n",
643
+ " 0,\n",
644
+ " 0,\n",
645
+ " 0,\n",
646
+ " 0,\n",
647
+ " 0,\n",
648
+ " 0,\n",
649
+ " 0,\n",
650
+ " 0,\n",
651
+ " 0,\n",
652
+ " 0,\n",
653
+ " 0,\n",
654
+ " 0,\n",
655
+ " 0,\n",
656
+ " 0,\n",
657
+ " 0,\n",
658
+ " 0,\n",
659
+ " 0,\n",
660
+ " 0,\n",
661
+ " 0,\n",
662
+ " 0,\n",
663
+ " 0,\n",
664
+ " 0,\n",
665
+ " 0,\n",
666
+ " 0,\n",
667
+ " 0,\n",
668
+ " 0,\n",
669
+ " 0,\n",
670
+ " 0,\n",
671
+ " 0,\n",
672
+ " 0,\n",
673
+ " 0,\n",
674
+ " 0,\n",
675
+ " 0,\n",
676
+ " 0,\n",
677
+ " 0,\n",
678
+ " 0,\n",
679
+ " 0,\n",
680
+ " 0,\n",
681
+ " 0,\n",
682
+ " 0,\n",
683
+ " 0,\n",
684
+ " 0,\n",
685
+ " 0,\n",
686
+ " 0,\n",
687
+ " 0,\n",
688
+ " 0,\n",
689
+ " 0,\n",
690
+ " 0,\n",
691
+ " 0,\n",
692
+ " 0,\n",
693
+ " 0,\n",
694
+ " 0,\n",
695
+ " 0,\n",
696
+ " 0,\n",
697
+ " 0,\n",
698
+ " 0,\n",
699
+ " 0,\n",
700
+ " 0,\n",
701
+ " 0,\n",
702
+ " 0,\n",
703
+ " 0,\n",
704
+ " 0,\n",
705
+ " 0,\n",
706
+ " 0,\n",
707
+ " 0,\n",
708
+ " 0,\n",
709
+ " 0,\n",
710
+ " 0,\n",
711
+ " 0],\n",
712
+ " 'attention_mask': [1,\n",
713
+ " 1,\n",
714
+ " 1,\n",
715
+ " 1,\n",
716
+ " 1,\n",
717
+ " 1,\n",
718
+ " 1,\n",
719
+ " 1,\n",
720
+ " 1,\n",
721
+ " 1,\n",
722
+ " 1,\n",
723
+ " 1,\n",
724
+ " 1,\n",
725
+ " 1,\n",
726
+ " 1,\n",
727
+ " 1,\n",
728
+ " 1,\n",
729
+ " 1,\n",
730
+ " 1,\n",
731
+ " 1,\n",
732
+ " 1,\n",
733
+ " 1,\n",
734
+ " 1,\n",
735
+ " 1,\n",
736
+ " 1,\n",
737
+ " 1,\n",
738
+ " 1,\n",
739
+ " 1,\n",
740
+ " 1,\n",
741
+ " 1,\n",
742
+ " 1,\n",
743
+ " 1,\n",
744
+ " 1,\n",
745
+ " 1,\n",
746
+ " 1,\n",
747
+ " 1,\n",
748
+ " 1,\n",
749
+ " 1,\n",
750
+ " 1,\n",
751
+ " 1,\n",
752
+ " 1,\n",
753
+ " 1,\n",
754
+ " 1,\n",
755
+ " 1,\n",
756
+ " 1,\n",
757
+ " 1,\n",
758
+ " 1,\n",
759
+ " 1,\n",
760
+ " 1,\n",
761
+ " 1,\n",
762
+ " 1,\n",
763
+ " 1,\n",
764
+ " 1,\n",
765
+ " 1,\n",
766
+ " 1,\n",
767
+ " 1,\n",
768
+ " 1,\n",
769
+ " 1,\n",
770
+ " 1,\n",
771
+ " 1,\n",
772
+ " 1,\n",
773
+ " 1,\n",
774
+ " 1,\n",
775
+ " 1,\n",
776
+ " 1,\n",
777
+ " 1,\n",
778
+ " 1,\n",
779
+ " 1,\n",
780
+ " 1,\n",
781
+ " 1,\n",
782
+ " 1,\n",
783
+ " 1,\n",
784
+ " 1,\n",
785
+ " 1,\n",
786
+ " 1,\n",
787
+ " 1,\n",
788
+ " 1,\n",
789
+ " 1,\n",
790
+ " 1,\n",
791
+ " 1,\n",
792
+ " 1,\n",
793
+ " 1,\n",
794
+ " 1,\n",
795
+ " 1,\n",
796
+ " 1,\n",
797
+ " 1,\n",
798
+ " 1,\n",
799
+ " 1,\n",
800
+ " 1,\n",
801
+ " 1,\n",
802
+ " 1,\n",
803
+ " 1,\n",
804
+ " 1,\n",
805
+ " 1,\n",
806
+ " 1,\n",
807
+ " 1,\n",
808
+ " 1,\n",
809
+ " 1,\n",
810
+ " 1,\n",
811
+ " 1,\n",
812
+ " 1,\n",
813
+ " 1,\n",
814
+ " 1,\n",
815
+ " 1,\n",
816
+ " 1,\n",
817
+ " 1,\n",
818
+ " 1,\n",
819
+ " 1,\n",
820
+ " 1,\n",
821
+ " 1,\n",
822
+ " 1,\n",
823
+ " 1,\n",
824
+ " 1,\n",
825
+ " 1,\n",
826
+ " 1,\n",
827
+ " 1,\n",
828
+ " 1,\n",
829
+ " 1,\n",
830
+ " 1,\n",
831
+ " 1,\n",
832
+ " 1,\n",
833
+ " 1,\n",
834
+ " 1,\n",
835
+ " 1,\n",
836
+ " 1,\n",
837
+ " 1,\n",
838
+ " 1,\n",
839
+ " 1,\n",
840
+ " 1,\n",
841
+ " 1,\n",
842
+ " 1,\n",
843
+ " 0,\n",
844
+ " 0,\n",
845
+ " 0,\n",
846
+ " 0,\n",
847
+ " 0,\n",
848
+ " 0,\n",
849
+ " 0,\n",
850
+ " 0,\n",
851
+ " 0,\n",
852
+ " 0,\n",
853
+ " 0,\n",
854
+ " 0,\n",
855
+ " 0,\n",
856
+ " 0,\n",
857
+ " 0,\n",
858
+ " 0,\n",
859
+ " 0,\n",
860
+ " 0,\n",
861
+ " 0,\n",
862
+ " 0,\n",
863
+ " 0,\n",
864
+ " 0,\n",
865
+ " 0,\n",
866
+ " 0,\n",
867
+ " 0,\n",
868
+ " 0,\n",
869
+ " 0,\n",
870
+ " 0,\n",
871
+ " 0,\n",
872
+ " 0,\n",
873
+ " 0,\n",
874
+ " 0,\n",
875
+ " 0,\n",
876
+ " 0,\n",
877
+ " 0,\n",
878
+ " 0,\n",
879
+ " 0,\n",
880
+ " 0,\n",
881
+ " 0,\n",
882
+ " 0,\n",
883
+ " 0,\n",
884
+ " 0,\n",
885
+ " 0,\n",
886
+ " 0,\n",
887
+ " 0,\n",
888
+ " 0,\n",
889
+ " 0,\n",
890
+ " 0,\n",
891
+ " 0,\n",
892
+ " 0,\n",
893
+ " 0,\n",
894
+ " 0,\n",
895
+ " 0,\n",
896
+ " 0,\n",
897
+ " 0,\n",
898
+ " 0,\n",
899
+ " 0,\n",
900
+ " 0,\n",
901
+ " 0,\n",
902
+ " 0,\n",
903
+ " 0,\n",
904
+ " 0,\n",
905
+ " 0,\n",
906
+ " 0,\n",
907
+ " 0,\n",
908
+ " 0,\n",
909
+ " 0,\n",
910
+ " 0,\n",
911
+ " 0,\n",
912
+ " 0,\n",
913
+ " 0,\n",
914
+ " 0,\n",
915
+ " 0,\n",
916
+ " 0,\n",
917
+ " 0,\n",
918
+ " 0,\n",
919
+ " 0,\n",
920
+ " 0,\n",
921
+ " 0,\n",
922
+ " 0,\n",
923
+ " 0,\n",
924
+ " 0,\n",
925
+ " 0,\n",
926
+ " 0,\n",
927
+ " 0,\n",
928
+ " 0,\n",
929
+ " 0,\n",
930
+ " 0,\n",
931
+ " 0,\n",
932
+ " 0,\n",
933
+ " 0,\n",
934
+ " 0,\n",
935
+ " 0,\n",
936
+ " 0,\n",
937
+ " 0,\n",
938
+ " 0,\n",
939
+ " 0,\n",
940
+ " 0,\n",
941
+ " 0,\n",
942
+ " 0,\n",
943
+ " 0,\n",
944
+ " 0,\n",
945
+ " 0,\n",
946
+ " 0,\n",
947
+ " 0,\n",
948
+ " 0,\n",
949
+ " 0,\n",
950
+ " 0,\n",
951
+ " 0,\n",
952
+ " 0,\n",
953
+ " 0,\n",
954
+ " 0,\n",
955
+ " 0,\n",
956
+ " 0,\n",
957
+ " 0,\n",
958
+ " 0,\n",
959
+ " 0,\n",
960
+ " 0,\n",
961
+ " 0,\n",
962
+ " 0,\n",
963
+ " 0,\n",
964
+ " 0,\n",
965
+ " 0,\n",
966
+ " 0,\n",
967
+ " 0,\n",
968
+ " 0,\n",
969
+ " 0,\n",
970
+ " 0,\n",
971
+ " 0,\n",
972
+ " 0,\n",
973
+ " 0,\n",
974
+ " 0,\n",
975
+ " 0,\n",
976
+ " 0,\n",
977
+ " 0,\n",
978
+ " 0,\n",
979
+ " 0,\n",
980
+ " 0,\n",
981
+ " 0,\n",
982
+ " 0,\n",
983
+ " 0,\n",
984
+ " 0,\n",
985
+ " 0,\n",
986
+ " 0,\n",
987
+ " 0,\n",
988
+ " 0,\n",
989
+ " 0,\n",
990
+ " 0,\n",
991
+ " 0,\n",
992
+ " 0,\n",
993
+ " 0,\n",
994
+ " 0,\n",
995
+ " 0,\n",
996
+ " 0,\n",
997
+ " 0,\n",
998
+ " 0,\n",
999
+ " 0,\n",
1000
+ " 0,\n",
1001
+ " 0,\n",
1002
+ " 0,\n",
1003
+ " 0,\n",
1004
+ " 0,\n",
1005
+ " 0,\n",
1006
+ " 0,\n",
1007
+ " 0,\n",
1008
+ " 0,\n",
1009
+ " 0,\n",
1010
+ " 0,\n",
1011
+ " 0,\n",
1012
+ " 0,\n",
1013
+ " 0,\n",
1014
+ " 0,\n",
1015
+ " 0,\n",
1016
+ " 0,\n",
1017
+ " 0,\n",
1018
+ " 0,\n",
1019
+ " 0,\n",
1020
+ " 0,\n",
1021
+ " 0,\n",
1022
+ " 0,\n",
1023
+ " 0,\n",
1024
+ " 0,\n",
1025
+ " 0,\n",
1026
+ " 0,\n",
1027
+ " 0,\n",
1028
+ " 0,\n",
1029
+ " 0,\n",
1030
+ " 0,\n",
1031
+ " 0,\n",
1032
+ " 0,\n",
1033
+ " 0,\n",
1034
+ " 0,\n",
1035
+ " 0,\n",
1036
+ " 0,\n",
1037
+ " 0,\n",
1038
+ " 0,\n",
1039
+ " 0,\n",
1040
+ " 0,\n",
1041
+ " 0,\n",
1042
+ " 0,\n",
1043
+ " 0,\n",
1044
+ " 0,\n",
1045
+ " 0,\n",
1046
+ " 0,\n",
1047
+ " 0,\n",
1048
+ " 0,\n",
1049
+ " 0,\n",
1050
+ " 0,\n",
1051
+ " 0,\n",
1052
+ " 0,\n",
1053
+ " 0,\n",
1054
+ " 0,\n",
1055
+ " 0,\n",
1056
+ " 0,\n",
1057
+ " 0,\n",
1058
+ " 0,\n",
1059
+ " 0,\n",
1060
+ " 0,\n",
1061
+ " 0,\n",
1062
+ " 0,\n",
1063
+ " 0,\n",
1064
+ " 0,\n",
1065
+ " 0,\n",
1066
+ " 0,\n",
1067
+ " 0,\n",
1068
+ " 0,\n",
1069
+ " 0,\n",
1070
+ " 0,\n",
1071
+ " 0,\n",
1072
+ " 0,\n",
1073
+ " 0,\n",
1074
+ " 0,\n",
1075
+ " 0,\n",
1076
+ " 0,\n",
1077
+ " 0,\n",
1078
+ " 0,\n",
1079
+ " 0,\n",
1080
+ " 0,\n",
1081
+ " 0,\n",
1082
+ " 0,\n",
1083
+ " 0,\n",
1084
+ " 0,\n",
1085
+ " 0,\n",
1086
+ " 0,\n",
1087
+ " 0,\n",
1088
+ " 0,\n",
1089
+ " 0,\n",
1090
+ " 0,\n",
1091
+ " 0,\n",
1092
+ " 0,\n",
1093
+ " 0,\n",
1094
+ " 0,\n",
1095
+ " 0,\n",
1096
+ " 0,\n",
1097
+ " 0,\n",
1098
+ " 0,\n",
1099
+ " 0,\n",
1100
+ " 0,\n",
1101
+ " 0,\n",
1102
+ " 0,\n",
1103
+ " 0,\n",
1104
+ " 0,\n",
1105
+ " 0,\n",
1106
+ " 0,\n",
1107
+ " 0,\n",
1108
+ " 0,\n",
1109
+ " 0,\n",
1110
+ " 0,\n",
1111
+ " 0,\n",
1112
+ " 0,\n",
1113
+ " 0,\n",
1114
+ " 0,\n",
1115
+ " 0,\n",
1116
+ " 0,\n",
1117
+ " 0,\n",
1118
+ " 0,\n",
1119
+ " 0,\n",
1120
+ " 0,\n",
1121
+ " 0,\n",
1122
+ " 0,\n",
1123
+ " 0,\n",
1124
+ " 0,\n",
1125
+ " 0,\n",
1126
+ " 0,\n",
1127
+ " 0,\n",
1128
+ " 0,\n",
1129
+ " 0,\n",
1130
+ " 0,\n",
1131
+ " 0,\n",
1132
+ " 0,\n",
1133
+ " 0,\n",
1134
+ " 0,\n",
1135
+ " 0,\n",
1136
+ " 0,\n",
1137
+ " 0,\n",
1138
+ " 0,\n",
1139
+ " 0,\n",
1140
+ " 0,\n",
1141
+ " 0,\n",
1142
+ " 0,\n",
1143
+ " 0,\n",
1144
+ " 0,\n",
1145
+ " 0,\n",
1146
+ " 0,\n",
1147
+ " 0,\n",
1148
+ " 0,\n",
1149
+ " 0,\n",
1150
+ " 0,\n",
1151
+ " 0,\n",
1152
+ " 0,\n",
1153
+ " 0,\n",
1154
+ " 0,\n",
1155
+ " 0,\n",
1156
+ " 0,\n",
1157
+ " 0,\n",
1158
+ " 0,\n",
1159
+ " 0,\n",
1160
+ " 0,\n",
1161
+ " 0,\n",
1162
+ " 0,\n",
1163
+ " 0,\n",
1164
+ " 0,\n",
1165
+ " 0,\n",
1166
+ " 0,\n",
1167
+ " 0,\n",
1168
+ " 0,\n",
1169
+ " 0,\n",
1170
+ " 0,\n",
1171
+ " 0,\n",
1172
+ " 0,\n",
1173
+ " 0,\n",
1174
+ " 0,\n",
1175
+ " 0,\n",
1176
+ " 0,\n",
1177
+ " 0,\n",
1178
+ " 0,\n",
1179
+ " 0,\n",
1180
+ " 0,\n",
1181
+ " 0,\n",
1182
+ " 0,\n",
1183
+ " 0,\n",
1184
+ " 0,\n",
1185
+ " 0,\n",
1186
+ " 0,\n",
1187
+ " 0,\n",
1188
+ " 0,\n",
1189
+ " 0,\n",
1190
+ " 0,\n",
1191
+ " 0,\n",
1192
+ " 0,\n",
1193
+ " 0,\n",
1194
+ " 0,\n",
1195
+ " 0,\n",
1196
+ " 0,\n",
1197
+ " 0,\n",
1198
+ " 0,\n",
1199
+ " 0,\n",
1200
+ " 0,\n",
1201
+ " 0,\n",
1202
+ " 0,\n",
1203
+ " 0,\n",
1204
+ " 0,\n",
1205
+ " 0,\n",
1206
+ " 0,\n",
1207
+ " 0,\n",
1208
+ " 0,\n",
1209
+ " 0,\n",
1210
+ " 0,\n",
1211
+ " 0,\n",
1212
+ " 0,\n",
1213
+ " 0,\n",
1214
+ " 0,\n",
1215
+ " 0,\n",
1216
+ " 0,\n",
1217
+ " 0,\n",
1218
+ " 0,\n",
1219
+ " 0,\n",
1220
+ " 0,\n",
1221
+ " 0,\n",
1222
+ " 0,\n",
1223
+ " 0]}"
1224
+ ]
1225
+ },
1226
+ "execution_count": 37,
1227
+ "metadata": {},
1228
+ "output_type": "execute_result"
1229
+ }
1230
+ ],
1231
+ "source": [
1232
+ "tokenized_datasets['test'][6]"
1233
+ ]
1234
+ },
1235
+ {
1236
+ "cell_type": "code",
1237
+ "execution_count": 31,
1238
+ "id": "83fc07e1",
1239
+ "metadata": {},
1240
+ "outputs": [],
1241
+ "source": [
1242
+ "def torch_dataset(dataset):\n",
1243
+ " dataset = dataset.remove_columns([\"text\"])\n",
1244
+ " dataset = dataset.rename_column(\"label\", \"labels\")\n",
1245
+ " dataset.set_format(\"torch\")\n",
1246
+ " return dataset\n",
1247
+ "\n",
1248
+ "train_dataset = torch_dataset(tokenized_datasets['train'])\n",
1249
+ "val_dataset = torch_dataset(tokenized_datasets['val'])\n",
1250
+ "test_dataset = torch_dataset(tokenized_datasets['test'])"
1251
+ ]
1252
+ },
1253
+ {
1254
+ "cell_type": "code",
1255
+ "execution_count": 32,
1256
+ "id": "57d3951b",
1257
+ "metadata": {},
1258
+ "outputs": [
1259
+ {
1260
+ "data": {
1261
+ "application/vnd.jupyter.widget-view+json": {
1262
+ "model_id": "7fa9701569cb45f9a176a5b96c79a1fd",
1263
+ "version_major": 2,
1264
+ "version_minor": 0
1265
+ },
1266
+ "text/plain": [
1267
+ "Creating json from Arrow format: 0%| | 0/5 [00:00<?, ?ba/s]"
1268
+ ]
1269
+ },
1270
+ "metadata": {},
1271
+ "output_type": "display_data"
1272
+ },
1273
+ {
1274
+ "data": {
1275
+ "application/vnd.jupyter.widget-view+json": {
1276
+ "model_id": "454180f7bb7843d2b480069facf4e004",
1277
+ "version_major": 2,
1278
+ "version_minor": 0
1279
+ },
1280
+ "text/plain": [
1281
+ "Creating json from Arrow format: 0%| | 0/2 [00:00<?, ?ba/s]"
1282
+ ]
1283
+ },
1284
+ "metadata": {},
1285
+ "output_type": "display_data"
1286
+ },
1287
+ {
1288
+ "data": {
1289
+ "application/vnd.jupyter.widget-view+json": {
1290
+ "model_id": "d2e63dab4f9c4c60877d56d61f4761e3",
1291
+ "version_major": 2,
1292
+ "version_minor": 0
1293
+ },
1294
+ "text/plain": [
1295
+ "Creating json from Arrow format: 0%| | 0/2 [00:00<?, ?ba/s]"
1296
+ ]
1297
+ },
1298
+ "metadata": {},
1299
+ "output_type": "display_data"
1300
+ },
1301
+ {
1302
+ "data": {
1303
+ "text/plain": [
1304
+ "32655572"
1305
+ ]
1306
+ },
1307
+ "execution_count": 32,
1308
+ "metadata": {},
1309
+ "output_type": "execute_result"
1310
+ }
1311
+ ],
1312
+ "source": [
1313
+ "train_dataset.to_json(\"train_dataset.json\")\n",
1314
+ "val_dataset.to_json(\"val_dataset.json\")\n",
1315
+ "test_dataset.to_json(\"test_dataset.json\")"
1316
+ ]
1317
+ },
1318
+ {
1319
+ "cell_type": "code",
1320
+ "execution_count": null,
1321
+ "id": "30e1afbf",
1322
+ "metadata": {},
1323
+ "outputs": [],
1324
+ "source": []
1325
+ },
1326
+ {
1327
+ "cell_type": "code",
1328
+ "execution_count": null,
1329
+ "id": "3815659a",
1330
+ "metadata": {},
1331
+ "outputs": [],
1332
+ "source": []
1333
+ }
1334
+ ],
1335
+ "metadata": {
1336
+ "kernelspec": {
1337
+ "display_name": "Python 3 (ipykernel)",
1338
+ "language": "python",
1339
+ "name": "python3"
1340
+ },
1341
+ "language_info": {
1342
+ "codemirror_mode": {
1343
+ "name": "ipython",
1344
+ "version": 3
1345
+ },
1346
+ "file_extension": ".py",
1347
+ "mimetype": "text/x-python",
1348
+ "name": "python",
1349
+ "nbconvert_exporter": "python",
1350
+ "pygments_lexer": "ipython3",
1351
+ "version": "3.9.7"
1352
+ }
1353
+ },
1354
+ "nbformat": 4,
1355
+ "nbformat_minor": 5
1356
+ }