SebastianS committed on
Commit
b83af22
1 Parent(s): bc13ef6

first model version

.ipynb_checkpoints/part4-checkpoint.ipynb CHANGED
@@ -309,10 +309,52 @@
309
  },
310
  {
311
  "cell_type": "code",
312
- "execution_count": null,
313
  "id": "06ef92c9",
314
  "metadata": {},
315
  "outputs": [],
316
  "source": []
317
  }
318
  ],
 
309
  },
310
  {
311
  "cell_type": "code",
312
+ "execution_count": 3,
313
  "id": "06ef92c9",
314
  "metadata": {},
315
  "outputs": [],
316
+ "source": [
317
+ "repo.git_pull()"
318
+ ]
319
+ },
320
+ {
321
+ "cell_type": "code",
322
+ "execution_count": 4,
323
+ "id": "3442a913",
324
+ "metadata": {},
325
+ "outputs": [
326
+ {
327
+ "name": "stderr",
328
+ "output_type": "stream",
329
+ "text": [
330
+ "To https://huggingface.co/SebastianS/dummy-model\n",
331
+ " 91d9c6c..bc13ef6 main -> main\n",
332
+ "\n"
333
+ ]
334
+ },
335
+ {
336
+ "data": {
337
+ "text/plain": [
338
+ "'https://huggingface.co/SebastianS/dummy-model/commit/bc13ef64436e852b999af0315b661eebf6fd6a42'"
339
+ ]
340
+ },
341
+ "execution_count": 4,
342
+ "metadata": {},
343
+ "output_type": "execute_result"
344
+ }
345
+ ],
346
+ "source": [
347
+ "repo.git_add()\n",
348
+ "repo.git_commit(\"added this file\")\n",
349
+ "repo.git_push()"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "id": "f036dfdd",
356
+ "metadata": {},
357
+ "outputs": [],
358
  "source": []
359
  }
360
  ],
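The cells added above drive the push that produced this commit through the huggingface_hub Repository helper. A minimal sketch of that workflow, assuming huggingface_hub is installed and the user is already logged in (the local directory name is illustrative, not taken from this commit):

from huggingface_hub import Repository

# Clone (or reuse) a local working copy of the Hub repo
repo = Repository(local_dir="dummy-model", clone_from="SebastianS/dummy-model")

repo.git_pull()                      # sync with the remote first
repo.git_add()                       # stage everything in local_dir
repo.git_commit("added this file")   # commit locally
repo.git_push()                      # push; returns the commit URL on the Hub, as seen in the cell output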
config.json CHANGED
@@ -1,1335 +1,27 @@
1
  {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "metadata": {},
6
- "source": [
7
- "# Processing data"
8
- ]
9
- },
10
- {
11
- "cell_type": "code",
12
- "execution_count": 4,
13
- "metadata": {},
14
- "outputs": [],
15
- "source": [
16
- "import torch\n",
17
- "from torch.utils.data import DataLoader\n",
18
- "from transformers import get_scheduler, TrainingArguments, Trainer, DataCollatorWithPadding, AdamW, AutoTokenizer, AutoModelForSequenceClassification\n",
19
- "from datasets import load_dataset\n",
20
- "import gc\n",
21
- "import numpy as np\n",
22
- "from datasets import load_metric\n",
23
- "import random\n",
24
- "import os\n",
25
- "from tqdm.auto import tqdm"
26
- ]
27
- },
28
- {
29
- "cell_type": "code",
30
- "execution_count": 5,
31
- "metadata": {},
32
- "outputs": [],
33
- "source": [
34
- "os.environ['CUDA_LAUNCH_BLOCKING'] = '1'"
35
- ]
36
- },
37
- {
38
- "cell_type": "code",
39
- "execution_count": 6,
40
- "metadata": {},
41
- "outputs": [],
42
- "source": [
43
- "# reset GPU memory\n",
44
- "gc.collect()\n",
45
- "torch.cuda.empty_cache()"
46
- ]
47
- },
48
- {
49
- "cell_type": "code",
50
- "execution_count": 3,
51
- "metadata": {},
52
- "outputs": [
53
- {
54
- "ename": "NameError",
55
- "evalue": "name 'AutoTokenizer' is not defined",
56
- "output_type": "error",
57
- "traceback": [
58
- "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
59
- "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
60
- "\u001b[1;32m<ipython-input-3-f5793421e6ee>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[0mcheckpoint\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;34m\"bert-base-uncased\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mtokenizer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mAutoTokenizer\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcheckpoint\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
61
- "\u001b[1;31mNameError\u001b[0m: name 'AutoTokenizer' is not defined"
62
- ]
63
- }
64
- ],
65
- "source": [
66
- "checkpoint = \"bert-base-uncased\"\n",
67
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint)"
68
- ]
69
- },
70
- {
71
- "cell_type": "code",
72
- "execution_count": 5,
73
- "metadata": {},
74
- "outputs": [
75
- {
76
- "name": "stderr",
77
- "output_type": "stream",
78
- "text": [
79
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']\n",
80
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
81
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
82
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
83
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
84
- ]
85
- }
86
- ],
87
- "source": [
88
- "checkpoint = \"bert-base-uncased\"\n",
89
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
90
- "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)"
91
- ]
92
- },
93
- {
94
- "cell_type": "code",
95
- "execution_count": 3,
96
- "metadata": {},
97
- "outputs": [],
98
- "source": [
99
- "sequences = [\n",
100
- " \"I've been waiting for a HuggingFace course my whole life.\",\n",
101
- " \"This course is amazing!\",\n",
102
- "]\n",
103
- "batch = tokenizer(sequences, padding=True, truncation=True, return_tensors=\"pt\")\n",
104
- "batch[\"labels\"] = torch.tensor([1, 1])\n",
105
- "optimizer = AdamW(model.parameters())\n",
106
- "loss = model(**batch).loss\n",
107
- "loss.backward()\n",
108
- "optimizer.step()"
109
- ]
110
- },
111
- {
112
- "cell_type": "code",
113
- "execution_count": 4,
114
- "metadata": {},
115
- "outputs": [
116
- {
117
- "name": "stderr",
118
- "output_type": "stream",
119
- "text": [
120
- "Reusing dataset glue (C:\\Users\\1seba\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
121
- ]
122
- }
123
- ],
124
- "source": [
125
- "raw_datasets = load_dataset(\"glue\",\"mrpc\")\n",
126
- "raw_train_dataset = raw_datasets['train']\n",
127
- "# print(raw_train_dataset.features)\n",
128
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
129
- "# # WHY CANT WE PASS THE DIFFERENT SENTENCES TOGETHER\n",
130
- "# tokenized_sentences_1 = tokenizer(raw_train_dataset[15]['sentence1'])\n",
131
- "# tokenized_sentences_2 = tokenizer(raw_train_dataset[15]['sentence2'])\n",
132
- "# print(tokenizer.decode(tokenized_sentences_1.input_ids), tokenizer.decode(tokenized_sentences_2.input_ids))\n",
133
- "# inputs = tokenizer(raw_train_dataset[15]['sentence1'], raw_train_dataset[15]['sentence2'])\n",
134
- "# print(tokenizer.decode(inputs.input_ids))\n",
135
- "inputs = tokenizer(raw_train_dataset['sentence1'], raw_train_dataset['sentence2'], padding=True, truncation=True)\n",
136
- "\n",
137
- "# tokenized_datasets = raw_datasets.map(tokenize_function, batched=False)\n",
138
- "# print(tokenized_datasets['train'].features)"
139
- ]
140
- },
141
- {
142
- "cell_type": "code",
143
- "execution_count": 5,
144
- "metadata": {},
145
- "outputs": [
146
- {
147
- "data": {
148
- "text/plain": [
149
- "['input_ids', 'token_type_ids', 'attention_mask']"
150
- ]
151
- },
152
- "execution_count": 5,
153
- "metadata": {},
154
- "output_type": "execute_result"
155
- }
156
- ],
157
- "source": [
158
- "list(inputs.keys())"
159
- ]
160
- },
161
- {
162
- "cell_type": "code",
163
- "execution_count": 6,
164
- "metadata": {},
165
- "outputs": [
166
- {
167
- "name": "stderr",
168
- "output_type": "stream",
169
- "text": [
170
- "100%|██████████| 4/4 [00:01<00:00, 3.69ba/s]\n",
171
- "100%|██████████| 1/1 [00:00<00:00, 16.42ba/s]\n",
172
- "100%|██████████| 2/2 [00:00<00:00, 6.22ba/s]\n"
173
- ]
174
- }
175
- ],
176
- "source": [
177
- "def tokenize_function(example):\n",
178
- " tokenized = tokenizer(example['sentence1'], example['sentence2'], truncation=True)\n",
179
- "# tokenized['input_ids'] = ['CHANGED!' for item in tokenized['input_ids']]\n",
180
- " return tokenized\n",
181
- "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)"
182
- ]
183
- },
184
- {
185
- "cell_type": "code",
186
- "execution_count": 9,
187
- "metadata": {},
188
- "outputs": [],
189
- "source": [
190
- "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
191
- ]
192
- },
193
- {
194
- "cell_type": "code",
195
- "execution_count": 10,
196
- "metadata": {},
197
- "outputs": [
198
- {
199
- "data": {
200
- "text/plain": [
201
- "[50, 59, 47, 67, 59, 50, 62, 32]"
202
- ]
203
- },
204
- "execution_count": 10,
205
- "metadata": {},
206
- "output_type": "execute_result"
207
- }
208
- ],
209
- "source": [
210
- "samples = tokenized_datasets[\"train\"][:8]\n",
211
- "samples = {k: v for k, v in samples.items() if k not in [\"idx\", \"sentence1\", \"sentence2\"]}\n",
212
- "[len(x) for x in samples[\"input_ids\"]]"
213
- ]
214
- },
215
- {
216
- "cell_type": "code",
217
- "execution_count": 37,
218
- "metadata": {
219
- "scrolled": true
220
- },
221
- "outputs": [
222
- {
223
- "data": {
224
- "text/plain": [
225
- "{'attention_mask': torch.Size([8, 67]),\n",
226
- " 'input_ids': torch.Size([8, 67]),\n",
227
- " 'token_type_ids': torch.Size([8, 67]),\n",
228
- " 'labels': torch.Size([8])}"
229
- ]
230
- },
231
- "execution_count": 37,
232
- "metadata": {},
233
- "output_type": "execute_result"
234
- }
235
- ],
236
- "source": [
237
- "batch = data_collator(samples)\n",
238
- "{k: v.shape for k, v in batch.items()}"
239
- ]
240
- },
241
- {
242
- "cell_type": "markdown",
243
- "metadata": {},
244
- "source": [
245
- "## Challenge 1"
246
- ]
247
- },
248
- {
249
- "cell_type": "code",
250
- "execution_count": 15,
251
- "metadata": {},
252
- "outputs": [],
253
- "source": [
254
- "from torch.utils.data import DataLoader"
255
- ]
256
- },
257
- {
258
- "cell_type": "code",
259
- "execution_count": 12,
260
- "metadata": {},
261
- "outputs": [],
262
- "source": [
263
- "samples = tokenized_datasets['test'][:8]\n",
264
- "samples = {k: samples[k] for k in list(samples.keys()) if k not in [\"idx\", \"sentence1\", \"sentence2\"]}"
265
- ]
266
- },
267
- {
268
- "cell_type": "code",
269
- "execution_count": 13,
270
- "metadata": {},
271
- "outputs": [],
272
- "source": [
273
- "padded_samples = data_collator(samples)"
274
- ]
275
- },
276
- {
277
- "cell_type": "code",
278
- "execution_count": 21,
279
- "metadata": {},
280
- "outputs": [],
281
- "source": [
282
- "\n",
283
- "train_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=True, collate_fn=data_collator)\n",
284
- "for batch in train_dataloader:\n",
285
- " print(batch['input_ids'].shape())"
286
- ]
287
- },
288
- {
289
- "cell_type": "markdown",
290
- "metadata": {},
291
- "source": [
292
- "## Challenge 2"
293
- ]
294
- },
295
- {
296
- "cell_type": "code",
297
- "execution_count": 5,
298
- "metadata": {},
299
- "outputs": [
300
- {
301
- "name": "stderr",
302
- "output_type": "stream",
303
- "text": [
304
- "Reusing dataset glue (C:\\Users\\1seba\\.cache\\huggingface\\datasets\\glue\\sst2\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n"
305
- ]
306
- }
307
- ],
308
- "source": [
309
- "raw_dataset_sst2 = load_dataset(\"glue\",\"sst2\")"
310
- ]
311
- },
312
- {
313
- "cell_type": "code",
314
- "execution_count": 6,
315
- "metadata": {},
316
- "outputs": [
317
- {
318
- "name": "stderr",
319
- "output_type": "stream",
320
- "text": [
321
- "100%|██████████| 68/68 [00:03<00:00, 18.46ba/s]\n",
322
- "100%|██████████| 1/1 [00:00<00:00, 16.67ba/s]\n",
323
- "100%|██████████| 2/2 [00:00<00:00, 16.67ba/s]\n"
324
- ]
325
- }
326
- ],
327
- "source": [
328
- "dataset_to_tokenize = raw_dataset_sst2\n",
329
- "def tokenize_dynamic(example):\n",
330
- " dynamic_sentence_list = [x for x in list(example.keys()) if x not in ['label', 'idx']]\n",
331
- " if len(dynamic_sentence_list) == 1:\n",
332
- " return tokenizer(example[dynamic_sentence_list[0]], truncation=True)\n",
333
- " else:\n",
334
- " return tokenizer(example[dynamic_sentence_list[0]], example[dynamic_sentence_list[1]], truncation=True)\n",
335
- "tokenized_datasets = dataset_to_tokenize.map(tokenize_dynamic, batched=True)"
336
- ]
337
- },
338
- {
339
- "cell_type": "code",
340
- "execution_count": 7,
341
- "metadata": {},
342
- "outputs": [],
343
- "source": [
344
- "samples = tokenized_datasets['train'][:8]\n",
345
- "samples = {k: samples[k] for k in list(samples.keys()) if k not in [\"idx\", \"sentence\", \"sentence1\", \"sentence2\"]}"
346
- ]
347
- },
348
- {
349
- "cell_type": "code",
350
- "execution_count": 8,
351
- "metadata": {},
352
- "outputs": [],
353
- "source": [
354
- "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
355
- ]
356
- },
357
- {
358
- "cell_type": "code",
359
- "execution_count": 74,
360
- "metadata": {},
361
- "outputs": [],
362
- "source": [
363
- "padded_data = data_collator(samples)"
364
- ]
365
- },
366
- {
367
- "cell_type": "markdown",
368
- "metadata": {},
369
- "source": [
370
- "# Fine-tuning a model with Trainer API"
371
- ]
372
- },
373
- {
374
- "cell_type": "code",
375
- "execution_count": 33,
376
- "metadata": {},
377
- "outputs": [
378
- {
379
- "name": "stderr",
380
- "output_type": "stream",
381
- "text": [
382
- "Reusing dataset glue (C:\\Users\\1seba\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
383
- "100%|██████████| 4/4 [00:00<00:00, 5.85ba/s]\n",
384
- "100%|██████████| 1/1 [00:00<00:00, 14.49ba/s]\n",
385
- "100%|██████████| 2/2 [00:00<00:00, 6.37ba/s]\n"
386
- ]
387
- }
388
- ],
389
- "source": [
390
- "# set up so far\n",
391
- "from datasets import load_dataset\n",
392
- "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
393
- "\n",
394
- "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
395
- "checkpoint = \"bert-base-uncased\"\n",
396
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
397
- "\n",
398
- "def tokenize_function(example):\n",
399
- " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
400
- "\n",
401
- "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
402
- "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
403
- ]
404
- },
405
- {
406
- "cell_type": "code",
407
- "execution_count": 9,
408
- "metadata": {},
409
- "outputs": [],
410
- "source": [
411
- "from transformers import TrainingArguments\n",
412
- "from transformers import AutoModelForSequenceClassification"
413
- ]
414
- },
415
- {
416
- "cell_type": "code",
417
- "execution_count": 34,
418
- "metadata": {},
419
- "outputs": [],
420
- "source": [
421
- "training_args = TrainingArguments(\"test-trainer\")\n",
422
- "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
423
- ]
424
- },
425
- {
426
- "cell_type": "code",
427
- "execution_count": 9,
428
- "metadata": {},
429
- "outputs": [],
430
- "source": []
431
- },
432
- {
433
- "cell_type": "code",
434
- "execution_count": 37,
435
- "metadata": {},
436
- "outputs": [
437
- {
438
- "name": "stderr",
439
- "output_type": "stream",
440
- "text": [
441
- "100%|██████████| 4/4 [00:00<00:00, 4.14ba/s]\n",
442
- "100%|██████████| 1/1 [00:00<00:00, 9.71ba/s]\n"
443
- ]
444
- }
445
- ],
446
- "source": [
447
- "train_dataset = tokenized_datasets[\"train\"].filter(percentageOfItems)\n",
448
- "validation_dataset = tokenized_datasets[\"validation\"].filter(percentageOfItems)"
449
- ]
450
- },
451
- {
452
- "cell_type": "code",
453
- "execution_count": 42,
454
- "metadata": {},
455
- "outputs": [],
456
- "source": [
457
- "trainer = Trainer(\n",
458
- " model,\n",
459
- " training_args,\n",
460
- " train_dataset=train_dataset,\n",
461
- " eval_dataset=validation_dataset,\n",
462
- " data_collator=data_collator,\n",
463
- " tokenizer=tokenizer,\n",
464
- ")"
465
- ]
466
- },
467
- {
468
- "cell_type": "code",
469
- "execution_count": null,
470
- "metadata": {},
471
- "outputs": [
472
- {
473
- "name": "stderr",
474
- "output_type": "stream",
475
- "text": [
476
- " 0%| | 0/132 [01:31<?, ?it/s]\n",
477
- "100%|██████████| 132/132 [00:44<00:00, 2.97it/s]"
478
- ]
479
- },
480
- {
481
- "name": "stdout",
482
- "output_type": "stream",
483
- "text": [
484
- "{'train_runtime': 44.4012, 'train_samples_per_second': 2.973, 'epoch': 3.0}\n"
485
- ]
486
- },
487
- {
488
- "name": "stderr",
489
- "output_type": "stream",
490
- "text": [
491
- "\n"
492
- ]
493
- },
494
- {
495
- "data": {
496
- "text/plain": [
497
- "TrainOutput(global_step=132, training_loss=0.4154145789868904, metrics={'train_runtime': 44.4012, 'train_samples_per_second': 2.973, 'epoch': 3.0})"
498
- ]
499
- },
500
- "metadata": {},
501
- "output_type": "display_data"
502
- }
503
- ],
504
- "source": [
505
- "trainer.train()"
506
- ]
507
- },
508
- {
509
- "cell_type": "code",
510
- "execution_count": 48,
511
- "metadata": {},
512
- "outputs": [
513
- {
514
- "name": "stderr",
515
- "output_type": "stream",
516
- "text": [
517
- " 80%|████████ | 4/5 [00:00<00:00, 9.37it/s]"
518
- ]
519
- },
520
- {
521
- "name": "stdout",
522
- "output_type": "stream",
523
- "text": [
524
- "(37, 2) (37,)\n"
525
- ]
526
- }
527
- ],
528
- "source": [
529
- "predictions = trainer.predict(validation_dataset)\n",
530
- "print(predictions.predictions.shape, predictions.label_ids.shape)"
531
- ]
532
- },
533
- {
534
- "cell_type": "code",
535
- "execution_count": 10,
536
- "metadata": {},
537
- "outputs": [],
538
- "source": [
539
- "import numpy as np\n",
540
- "from datasets import load_metric"
541
- ]
542
- },
543
- {
544
- "cell_type": "code",
545
- "execution_count": 49,
546
- "metadata": {},
547
- "outputs": [],
548
- "source": [
549
- "preds = np.argmax(predictions.predictions, axis=-1)"
550
- ]
551
- },
552
- {
553
- "cell_type": "code",
554
- "execution_count": 51,
555
- "metadata": {},
556
- "outputs": [
557
- {
558
- "data": {
559
- "text/plain": [
560
- "{'accuracy': 0.8378378378378378, 'f1': 0.8928571428571429}"
561
- ]
562
- },
563
- "execution_count": 51,
564
- "metadata": {},
565
- "output_type": "execute_result"
566
- }
567
- ],
568
- "source": [
569
- "metric = load_metric(\"glue\", \"mrpc\")\n",
570
- "metric.compute(predictions=preds, references=predictions.label_ids)"
571
- ]
572
- },
573
- {
574
- "cell_type": "code",
575
- "execution_count": 52,
576
- "metadata": {},
577
- "outputs": [],
578
- "source": [
579
- "def compute_metrics(eval_preds):\n",
580
- " metric = load_metric(\"glue\", \"mrpc\")\n",
581
- " logits, labels = eval_preds\n",
582
- " predictions = np.argmax(logits, axis=-1)\n",
583
- " return metric.compute(predictions=predictions, references=labels)"
584
- ]
585
- },
586
- {
587
- "cell_type": "code",
588
- "execution_count": 62,
589
- "metadata": {},
590
- "outputs": [
591
- {
592
- "name": "stderr",
593
- "output_type": "stream",
594
- "text": [
595
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']\n",
596
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
597
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
598
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
599
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
600
- ]
601
- }
602
- ],
603
- "source": [
604
- "training_args = TrainingArguments(\"test-trainer\", evaluation_strategy=\"epoch\")\n",
605
- "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
606
- "\n",
607
- "trainer = Trainer(\n",
608
- " model,\n",
609
- " training_args,\n",
610
- " train_dataset=train_dataset,\n",
611
- " eval_dataset=validation_dataset,\n",
612
- " data_collator=data_collator,\n",
613
- " tokenizer=tokenizer,\n",
614
- " compute_metrics=compute_metrics\n",
615
- ")"
616
- ]
617
- },
618
- {
619
- "cell_type": "code",
620
- "execution_count": 66,
621
- "metadata": {},
622
- "outputs": [
623
- {
624
- "name": "stderr",
625
- "output_type": "stream",
626
- "text": [
627
- " 1%| | 1/132 [00:19<43:22, 19.87s/it]\n",
628
- "100%|██████████| 5/5 [00:00<00:00, 17.23it/s]\n"
629
- ]
630
- },
631
- {
632
- "name": "stdout",
633
- "output_type": "stream",
634
- "text": [
635
- "{'eval_loss': 0.5742557048797607, 'eval_accuracy': 0.7027027027027027, 'eval_f1': 0.8070175438596492, 'eval_runtime': 0.9927, 'eval_samples_per_second': 37.273, 'epoch': 1.0}\n"
636
- ]
637
- },
638
- {
639
- "name": "stderr",
640
- "output_type": "stream",
641
- "text": [
642
- "100%|██████████| 5/5 [00:00<00:00, 17.03it/s]\n"
643
- ]
644
- },
645
- {
646
- "name": "stdout",
647
- "output_type": "stream",
648
- "text": [
649
- "{'eval_loss': 0.4739842414855957, 'eval_accuracy': 0.7837837837837838, 'eval_f1': 0.8620689655172413, 'eval_runtime': 0.9255, 'eval_samples_per_second': 39.977, 'epoch': 2.0}\n"
650
- ]
651
- },
652
- {
653
- "name": "stderr",
654
- "output_type": "stream",
655
- "text": [
656
- "100%|██████████| 5/5 [00:00<00:00, 16.95it/s]\n",
657
- " \n",
658
- "100%|██████████| 132/132 [00:46<00:00, 2.81it/s]"
659
- ]
660
- },
661
- {
662
- "name": "stdout",
663
- "output_type": "stream",
664
- "text": [
665
- "{'eval_loss': 0.5759992599487305, 'eval_accuracy': 0.7567567567567568, 'eval_f1': 0.8474576271186441, 'eval_runtime': 0.8269, 'eval_samples_per_second': 44.745, 'epoch': 3.0}\n",
666
- "{'train_runtime': 46.927, 'train_samples_per_second': 2.813, 'epoch': 3.0}\n"
667
- ]
668
- },
669
- {
670
- "name": "stderr",
671
- "output_type": "stream",
672
- "text": [
673
- "\n"
674
- ]
675
- },
676
- {
677
- "data": {
678
- "text/plain": [
679
- "TrainOutput(global_step=132, training_loss=0.39838010614568536, metrics={'train_runtime': 46.927, 'train_samples_per_second': 2.813, 'epoch': 3.0})"
680
- ]
681
- },
682
- "execution_count": 66,
683
- "metadata": {},
684
- "output_type": "execute_result"
685
- }
686
- ],
687
- "source": [
688
- "trainer.train()"
689
- ]
690
- },
691
- {
692
- "cell_type": "markdown",
693
- "metadata": {},
694
- "source": [
695
- "## Challenge 3"
696
- ]
697
- },
698
- {
699
- "cell_type": "code",
700
- "execution_count": 13,
701
- "metadata": {},
702
- "outputs": [
703
- {
704
- "name": "stderr",
705
- "output_type": "stream",
706
- "text": [
707
- "100%|██████████| 2/2 [00:00<00:00, 7.19ba/s]\n"
708
- ]
709
- }
710
- ],
711
- "source": [
712
- "# FILTER BREAKS THE LABELS ON THIS DATASET\n",
713
- "a = tokenized_datasets['test'].filter(lambda example, index: index % 2 == 0, with_indices=True)"
714
- ]
715
- },
716
- {
717
- "cell_type": "code",
718
- "execution_count": 21,
719
- "metadata": {},
720
- "outputs": [
721
- {
722
- "name": "stderr",
723
- "output_type": "stream",
724
- "text": [
725
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']\n",
726
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
727
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
728
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
729
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
730
- ]
731
- }
732
- ],
733
- "source": [
734
- "# use \"tokenized_datasets\" from challenge 2\n",
735
- "checkpoint = \"bert-base-uncased\"\n",
736
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
737
- "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
738
- "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
739
- "training_args = TrainingArguments('test-trainer', evaluation_strategy='epoch')\n",
740
- "train_shard = tokenized_datasets['train'].shard(num_shards=150, index=0)\n",
741
- "validation_shard = tokenized_datasets['validation'].shard(num_shards=4, index=0)\n",
742
- "metric_sst2 = load_metric('glue', 'sst2')\n",
743
- "\n",
744
- "# def compute_metrics(eval_preds):\n",
745
- "# metric = load_metric(\"glue\", \"mrpc\")\n",
746
- "# logits, labels = eval_preds\n",
747
- "# predictions = np.argmax(logits, axis=-1)\n",
748
- "# return metric.compute(predictions=predictions, references=labels)\n",
749
- "def compute_metrics (eval_preds):\n",
750
- " metric_sst2 = load_metric('glue', 'sst2')\n",
751
- " logits, labels = eval_preds\n",
752
- " predictions = np.argmax(logits, axis=-1)\n",
753
- " return metric_sst2.compute(predictions=predictions, references=labels)\n",
754
- "\n",
755
- "trainer = Trainer(\n",
756
- " model,\n",
757
- " training_args,\n",
758
- " train_dataset=train_shard,\n",
759
- " eval_dataset=validation_shard,\n",
760
- " data_collator=data_collator,\n",
761
- " tokenizer=tokenizer,\n",
762
- " compute_metrics=compute_metrics\n",
763
- ")"
764
- ]
765
- },
766
- {
767
- "cell_type": "code",
768
- "execution_count": 22,
769
- "metadata": {},
770
- "outputs": [
771
- {
772
- "name": "stderr",
773
- "output_type": "stream",
774
- "text": [
775
- "\n",
776
- " 33%|███▎ | 57/171 [00:35<00:58, 1.94it/s]"
777
- ]
778
- },
779
- {
780
- "name": "stdout",
781
- "output_type": "stream",
782
- "text": [
783
- "{'eval_loss': 0.38222888112068176, 'eval_accuracy': 0.8302752293577982, 'eval_runtime': 3.3093, 'eval_samples_per_second': 65.875, 'epoch': 1.0}\n"
784
- ]
785
- },
786
- {
787
- "name": "stderr",
788
- "output_type": "stream",
789
- "text": [
790
- "\n",
791
- " 67%|██████▋ | 114/171 [01:09<00:29, 1.93it/s]"
792
- ]
793
- },
794
- {
795
- "name": "stdout",
796
- "output_type": "stream",
797
- "text": [
798
- "{'eval_loss': 0.7558169364929199, 'eval_accuracy': 0.8165137614678899, 'eval_runtime': 3.5593, 'eval_samples_per_second': 61.248, 'epoch': 2.0}\n"
799
- ]
800
- },
801
- {
802
- "name": "stderr",
803
- "output_type": "stream",
804
- "text": [
805
- "\n",
806
- "100%|██████████| 171/171 [01:42<00:00, 1.66it/s]"
807
- ]
808
- },
809
- {
810
- "name": "stdout",
811
- "output_type": "stream",
812
- "text": [
813
- "{'eval_loss': 0.5612818598747253, 'eval_accuracy': 0.8669724770642202, 'eval_runtime': 3.3543, 'eval_samples_per_second': 64.991, 'epoch': 3.0}\n",
814
- "{'train_runtime': 102.7742, 'train_samples_per_second': 1.664, 'epoch': 3.0}\n"
815
- ]
816
- },
817
- {
818
- "name": "stderr",
819
- "output_type": "stream",
820
- "text": [
821
- "\n"
822
- ]
823
- },
824
- {
825
- "data": {
826
- "text/plain": [
827
- "TrainOutput(global_step=171, training_loss=0.276075485854121, metrics={'train_runtime': 102.7742, 'train_samples_per_second': 1.664, 'epoch': 3.0})"
828
- ]
829
- },
830
- "execution_count": 22,
831
- "metadata": {},
832
- "output_type": "execute_result"
833
- }
834
- ],
835
- "source": [
836
- "trainer.train()"
837
- ]
838
- },
839
- {
840
- "cell_type": "markdown",
841
- "metadata": {},
842
- "source": [
843
- "# A Full Training"
844
- ]
845
- },
846
- {
847
- "cell_type": "code",
848
- "execution_count": 5,
849
- "metadata": {},
850
- "outputs": [
851
- {
852
- "name": "stderr",
853
- "output_type": "stream",
854
- "text": [
855
- "Reusing dataset glue (C:\\Users\\1seba\\.cache\\huggingface\\datasets\\glue\\mrpc\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
856
- "100%|██████████| 4/4 [00:00<00:00, 7.09ba/s]\n",
857
- "100%|██████████| 1/1 [00:00<00:00, 16.39ba/s]\n",
858
- "100%|██████████| 2/2 [00:00<00:00, 9.01ba/s]\n"
859
- ]
860
- }
861
- ],
862
- "source": [
863
- "# setup\n",
864
- "from datasets import load_dataset\n",
865
- "from transformers import AutoTokenizer, DataCollatorWithPadding\n",
866
- "\n",
867
- "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
868
- "checkpoint = \"bert-base-uncased\"\n",
869
- "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
870
- "def tokenize_function(example):\n",
871
- " return tokenizer(example[\"sentence1\"], example[\"sentence2\"], truncation=True)\n",
872
- "tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)\n",
873
- "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)"
874
- ]
875
- },
876
- {
877
- "cell_type": "code",
878
- "execution_count": 6,
879
- "metadata": {},
880
- "outputs": [
881
- {
882
- "data": {
883
- "text/plain": [
884
- "['attention_mask', 'input_ids', 'labels', 'token_type_ids']"
885
- ]
886
- },
887
- "execution_count": 6,
888
- "metadata": {},
889
- "output_type": "execute_result"
890
- }
891
- ],
892
- "source": [
893
- "tokenized_datasets = tokenized_datasets.remove_columns([\"idx\", \"sentence1\", \"sentence2\"])\n",
894
- "tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')\n",
895
- "tokenized_datasets.set_format('torch')\n",
896
- "tokenized_datasets['train'].column_names"
897
- ]
898
- },
899
- {
900
- "cell_type": "code",
901
- "execution_count": 7,
902
- "metadata": {},
903
- "outputs": [],
904
- "source": [
905
- "from torch.utils.data import DataLoader\n",
906
- "train_dataloader = DataLoader(\n",
907
- " tokenized_datasets['train'].shard(num_shards=15, index=0), shuffle=True, batch_size=8, collate_fn=data_collator\n",
908
- ")\n",
909
- "eval_dataloader = DataLoader(\n",
910
- " tokenized_datasets['validation'].shard(num_shards=5, index=0), batch_size=8, collate_fn=data_collator\n",
911
- ")"
912
- ]
913
- },
914
- {
915
- "cell_type": "code",
916
- "execution_count": 60,
917
- "metadata": {},
918
- "outputs": [
919
- {
920
- "data": {
921
- "text/plain": [
922
- "{'attention_mask': torch.Size([8, 64]),\n",
923
- " 'input_ids': torch.Size([8, 64]),\n",
924
- " 'labels': torch.Size([8]),\n",
925
- " 'token_type_ids': torch.Size([8, 64])}"
926
- ]
927
- },
928
- "execution_count": 60,
929
- "metadata": {},
930
- "output_type": "execute_result"
931
- }
932
- ],
933
- "source": [
934
- "for batch in train_dataloader:\n",
935
- " break\n",
936
- "{k: v.shape for k, v in batch.items()}"
937
- ]
938
- },
939
- {
940
- "cell_type": "code",
941
- "execution_count": 61,
942
- "metadata": {},
943
- "outputs": [
944
- {
945
- "name": "stderr",
946
- "output_type": "stream",
947
- "text": [
948
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']\n",
949
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
950
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
951
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
952
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
953
- ]
954
- }
955
- ],
956
- "source": [
957
- "from transformers import AutoModelForSequenceClassification\n",
958
- "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
959
- ]
960
- },
961
- {
962
- "cell_type": "code",
963
- "execution_count": 62,
964
- "metadata": {},
965
- "outputs": [
966
- {
967
- "name": "stdout",
968
- "output_type": "stream",
969
- "text": [
970
- "tensor(0.5705, grad_fn=<NllLossBackward>) torch.Size([8, 2])\n"
971
- ]
972
- }
973
- ],
974
- "source": [
975
- "outputs = model(**batch)\n",
976
- "print(outputs.loss, outputs.logits.shape)"
977
- ]
978
- },
979
- {
980
- "cell_type": "code",
981
- "execution_count": 63,
982
- "metadata": {},
983
- "outputs": [],
984
- "source": [
985
- "from transformers import AdamW\n",
986
- "optimizer = AdamW(model.parameters(), lr=5e-5)"
987
- ]
988
- },
989
- {
990
- "cell_type": "code",
991
- "execution_count": 64,
992
- "metadata": {},
993
- "outputs": [
994
- {
995
- "name": "stdout",
996
- "output_type": "stream",
997
- "text": [
998
- "93\n"
999
- ]
1000
- }
1001
- ],
1002
- "source": [
1003
- "from transformers import get_scheduler\n",
1004
- "num_epochs = 3\n",
1005
- "num_training_steps = num_epochs * len(train_dataloader)\n",
1006
- "lr_scheduler = get_scheduler(\n",
1007
- " 'linear',\n",
1008
- " optimizer,\n",
1009
- " num_warmup_steps=0,\n",
1010
- " num_training_steps=num_training_steps,\n",
1011
- ")\n",
1012
- "print(num_training_steps)\n"
1013
- ]
1014
- },
1015
- {
1016
- "cell_type": "code",
1017
- "execution_count": 65,
1018
- "metadata": {},
1019
- "outputs": [
1020
- {
1021
- "data": {
1022
- "text/plain": [
1023
- "device(type='cuda')"
1024
- ]
1025
- },
1026
- "execution_count": 65,
1027
- "metadata": {},
1028
- "output_type": "execute_result"
1029
- }
1030
- ],
1031
- "source": [
1032
- "import torch\n",
1033
- "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
1034
- "model.to(device)\n",
1035
- "device"
1036
- ]
1037
- },
1038
- {
1039
- "cell_type": "code",
1040
- "execution_count": 71,
1041
- "metadata": {},
1042
- "outputs": [
1043
- {
1044
- "name": "stderr",
1045
- "output_type": "stream",
1046
- "text": [
1047
- "100%|██████████| 93/93 [08:50<00:00, 5.70s/it]\n",
1048
- "100%|██████████| 93/93 [00:28<00:00, 3.21it/s]"
1049
- ]
1050
- }
1051
- ],
1052
- "source": [
1053
- "from tqdm.auto import tqdm\n",
1054
- "progress_bar = tqdm(range(num_training_steps))\n",
1055
- "model.train()\n",
1056
- "for epoch in range(num_epochs):\n",
1057
- " for batch in train_dataloader:\n",
1058
- " batch = {k: v.to(device) for k, v in batch.items()}\n",
1059
- " outputs = model(**batch)\n",
1060
- " loss = outputs.loss\n",
1061
- " loss.backward()\n",
1062
- " optimizer.step()\n",
1063
- " optimizer.zero_grad()\n",
1064
- " progress_bar.update(1)\n",
1065
- " \n",
1066
- " # metric = load_metric('glue', 'mrpc')\n",
1067
- " # model.eval()\n",
1068
- " # for batch in eval_dataloader:\n",
1069
- " # batch = {k: v.to(device) for k, v in batch.items()}\n",
1070
- " # with torch.no_grad():\n",
1071
- " # outputs = model(**batch)\n",
1072
- " # logits = outputs.logits\n",
1073
- " # predictions = torch.argmax(logits, dim=-1)\n",
1074
- " # metric.add_batch(predictions=predictions, references=batch['labels'])\n",
1075
- " # print(metric.compute())"
1076
- ]
1077
- },
1078
- {
1079
- "cell_type": "code",
1080
- "execution_count": 109,
1081
- "metadata": {},
1082
- "outputs": [
1083
- {
1084
- "data": {
1085
- "text/plain": [
1086
- "{'accuracy': 0.6463414634146342, 'f1': 0.7851851851851851}"
1087
- ]
1088
- },
1089
- "execution_count": 109,
1090
- "metadata": {},
1091
- "output_type": "execute_result"
1092
- }
1093
- ],
1094
- "source": [
1095
- "from datasets import load_metric\n",
1096
- "metric = load_metric('glue', 'mrpc')\n",
1097
- "model.eval()\n",
1098
- "for batch in eval_dataloader:\n",
1099
- " batch = {k: v.to(device) for k, v in batch.items()}\n",
1100
- " with torch.no_grad():\n",
1101
- " outputs = model(**batch)\n",
1102
- " logits = outputs.logits\n",
1103
- " predictions = torch.argmax(logits, dim=-1)\n",
1104
- " metric.add_batch(predictions=predictions, references=batch['labels'])\n",
1105
- "metric.compute()"
1106
- ]
1107
- },
1108
- {
1109
- "cell_type": "markdown",
1110
- "metadata": {},
1111
- "source": [
1112
- "## Challenge 1"
1113
- ]
1114
- },
1115
- {
1116
- "cell_type": "code",
1117
- "execution_count": 20,
1118
- "metadata": {},
1119
- "outputs": [
1120
- {
1121
- "name": "stderr",
1122
- "output_type": "stream",
1123
- "text": [
1124
- "Reusing dataset glue (C:\\Users\\1seba\\.cache\\huggingface\\datasets\\glue\\sst2\\1.0.0\\dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)\n",
1125
- "100%|██████████| 68/68 [00:03<00:00, 20.33ba/s]\n",
1126
- "100%|██████████| 1/1 [00:00<00:00, 17.24ba/s]\n",
1127
- "100%|██████████| 2/2 [00:00<00:00, 16.53ba/s]\n"
1128
- ]
1129
- }
1130
- ],
1131
- "source": [
1132
- "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
1133
- "\n",
1134
- "sst2_datasets = load_dataset(\"glue\", \"sst2\")\n",
1135
- "def tokenize_function (example):\n",
1136
- " return tokenizer(example['sentence'], truncation=True)\n",
1137
- "tokenized_datasets = sst2_datasets.map(tokenize_function, batched=True)\n",
1138
- "tokenized_datasets = tokenized_datasets.remove_columns([\"idx\", \"sentence\"])\n",
1139
- "tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')\n",
1140
- "tokenized_datasets.set_format('torch')\n",
1141
- "data_collator = DataCollatorWithPadding(tokenizer=tokenizer)\n",
1142
- "train_dataset = DataLoader(\n",
1143
- " tokenized_datasets['train'].shard(num_shards=180, index=0), shuffle=True, batch_size=8, collate_fn=data_collator\n",
1144
- ")\n",
1145
- "eval_dataset = DataLoader(\n",
1146
- " tokenized_datasets['validation'].shard(num_shards=4, index=0), batch_size=8, collate_fn=data_collator\n",
1147
- ")"
1148
- ]
1149
- },
1150
- {
1151
- "cell_type": "code",
1152
- "execution_count": 31,
1153
- "metadata": {},
1154
- "outputs": [
1155
- {
1156
- "name": "stderr",
1157
- "output_type": "stream",
1158
- "text": [
1159
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']\n",
1160
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
1161
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
1162
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
1163
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
1164
- "100%|██████████| 141/141 [18:15<00:00, 7.77s/it]\n",
1165
- "100%|██████████| 141/141 [01:12<00:00, 2.21it/s]"
1166
- ]
1167
- },
1168
- {
1169
- "name": "stdout",
1170
- "output_type": "stream",
1171
- "text": [
1172
- "[{'accuracy': 0.7568807339449541}, {'accuracy': 0.8256880733944955}, {'accuracy': 0.8623853211009175}]\n"
1173
- ]
1174
- }
1175
- ],
1176
- "source": [
1177
- "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
1178
- "model.to(device)\n",
1179
- "optimizer= AdamW(model.parameters(), 5e-5)\n",
1180
- "\n",
1181
- "num_epochs = 3\n",
1182
- "num_training_steps = num_epochs * len(train_dataset)\n",
1183
- "lr_scheduler = get_scheduler(\n",
1184
- " 'linear',\n",
1185
- " optimizer=optimizer,\n",
1186
- " num_warmup_steps=0,\n",
1187
- " num_training_steps=num_training_steps,\n",
1188
- ")\n",
1189
- "\n",
1190
- "metrics = []\n",
1191
- "\n",
1192
- "progress_bar = tqdm(range(num_training_steps))\n",
1193
- "model.train()\n",
1194
- "for epoch in range(num_epochs):\n",
1195
- " for batch in train_dataset:\n",
1196
- " batch = {k: v.to(device) for k, v in batch.items()}\n",
1197
- " outputs = model(**batch)\n",
1198
- " loss = outputs.loss\n",
1199
- " loss.backward()\n",
1200
- " optimizer.step()\n",
1201
- " lr_scheduler.step()\n",
1202
- " optimizer.zero_grad()\n",
1203
- " progress_bar.update(1)\n",
1204
- "\n",
1205
- " metric= load_metric(\"glue\", \"sst2\")\n",
1206
- " model.eval()\n",
1207
- " for batch in eval_dataset:\n",
1208
- " batch = {k: v.to(device) for k, v in batch.items()}\n",
1209
- " with torch.no_grad():\n",
1210
- " outputs = model(**batch)\n",
1211
- " logits = outputs.logits\n",
1212
- " predictions = torch.argmax(logits, dim=-1)\n",
1213
- " metric.add_batch(predictions=predictions, references=batch[\"labels\"])\n",
1214
- " metrics.append(metric.compute())\n",
1215
- "\n",
1216
- "print(metrics)"
1217
- ]
1218
- },
1219
- {
1220
- "cell_type": "markdown",
1221
- "metadata": {},
1222
- "source": [
1223
- "## (end)"
1224
- ]
1225
- },
1226
- {
1227
- "cell_type": "code",
1228
- "execution_count": 8,
1229
- "metadata": {},
1230
- "outputs": [],
1231
- "source": [
1232
- "from accelerate import Accelerator\n",
1233
- "accelerator = Accelerator()"
1234
- ]
1235
- },
1236
- {
1237
- "cell_type": "code",
1238
- "execution_count": 9,
1239
- "metadata": {},
1240
- "outputs": [
1241
- {
1242
- "name": "stderr",
1243
- "output_type": "stream",
1244
- "text": [
1245
- "Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']\n",
1246
- "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
1247
- "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
1248
- "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
1249
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
1250
- "100%|██████████| 93/93 [01:11<00:00, 1.85it/s]"
1251
- ]
1252
- },
1253
- {
1254
- "name": "stdout",
1255
- "output_type": "stream",
1256
- "text": [
1257
- "[{'accuracy': 0.6707317073170732}, {'accuracy': 0.7073170731707317}, {'accuracy': 0.7560975609756098}]\n"
1258
- ]
1259
- }
1260
- ],
1261
- "source": [
1262
- "model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
1263
- "optimizer= AdamW(model.parameters(), 5e-5)\n",
1264
- "train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(\n",
1265
- " train_dataloader, eval_dataloader, model, optimizer\n",
1266
- ")\n",
1267
- "\n",
1268
- "num_epochs = 3\n",
1269
- "num_training_steps = num_epochs * len(train_dataloader)\n",
1270
- "lr_scheduler = get_scheduler(\n",
1271
- " 'linear',\n",
1272
- " optimizer=optimizer,\n",
1273
- " num_warmup_steps=0,\n",
1274
- " num_training_steps=num_training_steps,\n",
1275
- ")\n",
1276
- "\n",
1277
- "metrics = []\n",
1278
- "\n",
1279
- "progress_bar = tqdm(range(num_training_steps))\n",
1280
- "model.train()\n",
1281
- "for epoch in range(num_epochs):\n",
1282
- " for batch in train_dataloader:\n",
1283
- " outputs = model(**batch)\n",
1284
- " loss = outputs.loss\n",
1285
- " accelerator.backward(loss)\n",
1286
- " optimizer.step()\n",
1287
- " lr_scheduler.step()\n",
1288
- " optimizer.zero_grad()\n",
1289
- " progress_bar.update(1)\n",
1290
- "\n",
1291
- " metric= load_metric(\"glue\", \"sst2\")\n",
1292
- " model.eval()\n",
1293
- " for batch in eval_dataloader:\n",
1294
- " with torch.no_grad():\n",
1295
- " outputs = model(**batch)\n",
1296
- " logits = outputs.logits\n",
1297
- " predictions = torch.argmax(logits, dim=-1)\n",
1298
- " metric.add_batch(predictions=predictions, references=batch[\"labels\"])\n",
1299
- " metrics.append(metric.compute())\n",
1300
- "\n",
1301
- "print(metrics)"
1302
- ]
1303
- },
1304
- {
1305
- "cell_type": "code",
1306
- "execution_count": null,
1307
- "metadata": {},
1308
- "outputs": [],
1309
- "source": []
1310
- }
1311
- ],
1312
- "metadata": {
1313
- "interpreter": {
1314
- "hash": "c23364dc34acf6d559b2ccbb804894040b11f1b7cd300b891de29d32dea3c2c2"
1315
- },
1316
- "kernelspec": {
1317
- "display_name": "Python 3.8.10 64-bit ('AI': conda)",
1318
- "name": "python3"
1319
- },
1320
- "language_info": {
1321
- "codemirror_mode": {
1322
- "name": "ipython",
1323
- "version": 3
1324
- },
1325
- "file_extension": ".py",
1326
- "mimetype": "text/x-python",
1327
- "name": "python",
1328
- "nbconvert_exporter": "python",
1329
- "pygments_lexer": "ipython3",
1330
- "version": "3.8.10"
1331
- }
1332
- },
1333
- "nbformat": 4,
1334
- "nbformat_minor": 5
1335
  }
 
1
  {
2
+ "_name_or_path": "camembert-base",
3
+ "architectures": [
4
+ "CamembertForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 5,
8
+ "eos_token_id": 6,
9
+ "gradient_checkpointing": false,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 3072,
15
+ "layer_norm_eps": 1e-05,
16
+ "max_position_embeddings": 514,
17
+ "model_type": "camembert",
18
+ "num_attention_heads": 12,
19
+ "num_hidden_layers": 12,
20
+ "output_past": true,
21
+ "pad_token_id": 1,
22
+ "position_embedding_type": "absolute",
23
+ "transformers_version": "4.6.1",
24
+ "type_vocab_size": 1,
25
+ "use_cache": true,
26
+ "vocab_size": 32005
27
  }
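The new config.json is the standard CamembertForMaskedLM configuration written out by save_pretrained; part4.ipynb below shows the model being loaded from the camembert-base checkpoint and saved into this repository. A rough sketch of how a config like this is produced, assuming transformers is installed:

from transformers import CamembertForMaskedLM

model = CamembertForMaskedLM.from_pretrained("camembert-base")
model.save_pretrained("./")  # writes config.json and pytorch_model.bin into the working directory
print(model.config.model_type, model.config.vocab_size)  # camembert 32005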
hello.txt ADDED
@@ -0,0 +1 @@
1
+ Hello
part4.ipynb CHANGED
@@ -2,7 +2,7 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 9,
6
  "id": "aa7a358a",
7
  "metadata": {},
8
  "outputs": [],
@@ -12,100 +12,25 @@
12
  },
13
  {
14
  "cell_type": "code",
15
- "execution_count": 10,
16
  "id": "c7e39f7f",
17
  "metadata": {},
18
- "outputs": [
19
- {
20
- "data": {
21
- "application/vnd.jupyter.widget-view+json": {
22
- "model_id": "f07d3dc0c67842c5905d2a8d9bbc0ee8",
23
- "version_major": 2,
24
- "version_minor": 0
25
- },
26
- "text/plain": [
27
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=810912.0, style=ProgressStyle(descripti…"
28
- ]
29
- },
30
- "metadata": {},
31
- "output_type": "display_data"
32
- },
33
- {
34
- "name": "stdout",
35
- "output_type": "stream",
36
- "text": [
37
- "\n"
38
- ]
39
- },
40
- {
41
- "data": {
42
- "application/vnd.jupyter.widget-view+json": {
43
- "model_id": "bafae4f91f7e490087300d6fcd12ad15",
44
- "version_major": 2,
45
- "version_minor": 0
46
- },
47
- "text/plain": [
48
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1395301.0, style=ProgressStyle(descript…"
49
- ]
50
- },
51
- "metadata": {},
52
- "output_type": "display_data"
53
- },
54
- {
55
- "name": "stdout",
56
- "output_type": "stream",
57
- "text": [
58
- "\n"
59
- ]
60
- },
61
- {
62
- "data": {
63
- "application/vnd.jupyter.widget-view+json": {
64
- "model_id": "55b5654c906441d3bba3d48c72a373f2",
65
- "version_major": 2,
66
- "version_minor": 0
67
- },
68
- "text/plain": [
69
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=508.0, style=ProgressStyle(description_…"
70
- ]
71
- },
72
- "metadata": {},
73
- "output_type": "display_data"
74
- },
75
- {
76
- "name": "stdout",
77
- "output_type": "stream",
78
- "text": [
79
- "\n"
80
- ]
81
- },
82
- {
83
- "data": {
84
- "application/vnd.jupyter.widget-view+json": {
85
- "model_id": "659a9e30adb94f24bd78d87fb4f7706d",
86
- "version_major": 2,
87
- "version_minor": 0
88
- },
89
- "text/plain": [
90
- "HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445032417.0, style=ProgressStyle(descri…"
91
- ]
92
- },
93
- "metadata": {},
94
- "output_type": "display_data"
95
- },
96
- {
97
- "name": "stdout",
98
- "output_type": "stream",
99
- "text": [
100
- "\n"
101
- ]
102
- }
103
- ],
104
  "source": [
105
  "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n",
106
  "model = CamembertForMaskedLM.from_pretrained(\"camembert-base\")"
107
  ]
108
  },
109
  {
110
  "cell_type": "code",
111
  "execution_count": 1,
@@ -317,10 +242,42 @@
317
  "repo.git_pull()"
318
  ]
319
  },
320
  {
321
  "cell_type": "code",
322
  "execution_count": null,
323
- "id": "574529d9",
324
  "metadata": {},
325
  "outputs": [],
326
  "source": []
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "id": "aa7a358a",
7
  "metadata": {},
8
  "outputs": [],
 
12
  },
13
  {
14
  "cell_type": "code",
15
+ "execution_count": 2,
16
  "id": "c7e39f7f",
17
  "metadata": {},
18
+ "outputs": [],
19
  "source": [
20
  "tokenizer = CamembertTokenizer.from_pretrained(\"camembert-base\")\n",
21
  "model = CamembertForMaskedLM.from_pretrained(\"camembert-base\")"
22
  ]
23
  },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 3,
27
+ "id": "30ca41f5",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "model.save_pretrained(\"./\")"
32
+ ]
33
+ },
34
  {
35
  "cell_type": "code",
36
  "execution_count": 1,
 
242
  "repo.git_pull()"
243
  ]
244
  },
245
+ {
246
+ "cell_type": "code",
247
+ "execution_count": 4,
248
+ "id": "3442a913",
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "name": "stderr",
253
+ "output_type": "stream",
254
+ "text": [
255
+ "To https://huggingface.co/SebastianS/dummy-model\n",
256
+ " 91d9c6c..bc13ef6 main -> main\n",
257
+ "\n"
258
+ ]
259
+ },
260
+ {
261
+ "data": {
262
+ "text/plain": [
263
+ "'https://huggingface.co/SebastianS/dummy-model/commit/bc13ef64436e852b999af0315b661eebf6fd6a42'"
264
+ ]
265
+ },
266
+ "execution_count": 4,
267
+ "metadata": {},
268
+ "output_type": "execute_result"
269
+ }
270
+ ],
271
+ "source": [
272
+ "repo.git_add()\n",
273
+ "repo.git_commit(\"added this file\")\n",
274
+ "repo.git_push()"
275
+ ]
276
+ },
277
  {
278
  "cell_type": "code",
279
  "execution_count": null,
280
+ "id": "f036dfdd",
281
  "metadata": {},
282
  "outputs": [],
283
  "source": []
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23e3c7b2ac552b676f86d627ac840b9138181091c281d047caa0fa638c5e562a
3
+ size 442709831
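Because pytorch_model.bin is tracked with Git LFS, the file committed here is only a pointer (an oid and a size of 442709831 bytes, roughly 443 MB); the actual weights live on the LFS server. A hedged example of fetching the resolved file, assuming the huggingface_hub client is available (the call is illustrative, not part of this commit):

from huggingface_hub import hf_hub_download

# Downloads and caches the real LFS object rather than the pointer file
weights_path = hf_hub_download(repo_id="SebastianS/dummy-model", filename="pytorch_model.bin")
print(weights_path)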