josipabebic committed on
Commit
573e720
·
verified ·
1 Parent(s): ac367c8

Delete BERTić code.ipynb

Browse files
Files changed (1) hide show
  1. BERTić code.ipynb +0 -877
BERTić code.ipynb DELETED
@@ -1,877 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "ac15a924",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stdout",
11
- "output_type": "stream",
12
- "text": [
13
- "\n",
14
- "\n",
15
- "=== Treniranje i evaluacija za trening skup: train_combined ===\n",
16
- "\n",
17
- "--- Fine-tuning model: classla/bcms-bertic ---\n"
18
- ]
19
- },
20
- {
21
- "name": "stderr",
22
- "output_type": "stream",
23
- "text": [
24
- "Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
25
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
26
- ]
27
- },
28
- {
29
- "data": {
30
- "application/vnd.jupyter.widget-view+json": {
31
- "model_id": "aed062bba63e49b5a657521638acc86c",
32
- "version_major": 2,
33
- "version_minor": 0
34
- },
35
- "text/plain": [
36
- "Map: 0%| | 0/7577 [00:00<?, ? examples/s]"
37
- ]
38
- },
39
- "metadata": {},
40
- "output_type": "display_data"
41
- },
42
- {
43
- "name": "stderr",
44
- "output_type": "stream",
45
- "text": [
46
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
47
- " warnings.warn(warn_msg)\n"
48
- ]
49
- },
50
- {
51
- "data": {
52
- "text/html": [
53
- "\n",
54
- " <div>\n",
55
- " \n",
56
- " <progress value='1422' max='1422' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
57
- " [1422/1422 45:18, Epoch 3/3]\n",
58
- " </div>\n",
59
- " <table border=\"1\" class=\"dataframe\">\n",
60
- " <thead>\n",
61
- " <tr style=\"text-align: left;\">\n",
62
- " <th>Step</th>\n",
63
- " <th>Training Loss</th>\n",
64
- " </tr>\n",
65
- " </thead>\n",
66
- " <tbody>\n",
67
- " <tr>\n",
68
- " <td>50</td>\n",
69
- " <td>0.929600</td>\n",
70
- " </tr>\n",
71
- " <tr>\n",
72
- " <td>100</td>\n",
73
- " <td>0.843100</td>\n",
74
- " </tr>\n",
75
- " <tr>\n",
76
- " <td>150</td>\n",
77
- " <td>0.744100</td>\n",
78
- " </tr>\n",
79
- " <tr>\n",
80
- " <td>200</td>\n",
81
- " <td>0.645000</td>\n",
82
- " </tr>\n",
83
- " <tr>\n",
84
- " <td>250</td>\n",
85
- " <td>0.633000</td>\n",
86
- " </tr>\n",
87
- " <tr>\n",
88
- " <td>300</td>\n",
89
- " <td>0.641400</td>\n",
90
- " </tr>\n",
91
- " <tr>\n",
92
- " <td>350</td>\n",
93
- " <td>0.618200</td>\n",
94
- " </tr>\n",
95
- " <tr>\n",
96
- " <td>400</td>\n",
97
- " <td>0.594200</td>\n",
98
- " </tr>\n",
99
- " <tr>\n",
100
- " <td>450</td>\n",
101
- " <td>0.578800</td>\n",
102
- " </tr>\n",
103
- " <tr>\n",
104
- " <td>500</td>\n",
105
- " <td>0.484900</td>\n",
106
- " </tr>\n",
107
- " <tr>\n",
108
- " <td>550</td>\n",
109
- " <td>0.436400</td>\n",
110
- " </tr>\n",
111
- " <tr>\n",
112
- " <td>600</td>\n",
113
- " <td>0.485900</td>\n",
114
- " </tr>\n",
115
- " <tr>\n",
116
- " <td>650</td>\n",
117
- " <td>0.484800</td>\n",
118
- " </tr>\n",
119
- " <tr>\n",
120
- " <td>700</td>\n",
121
- " <td>0.509100</td>\n",
122
- " </tr>\n",
123
- " <tr>\n",
124
- " <td>750</td>\n",
125
- " <td>0.437300</td>\n",
126
- " </tr>\n",
127
- " <tr>\n",
128
- " <td>800</td>\n",
129
- " <td>0.518500</td>\n",
130
- " </tr>\n",
131
- " <tr>\n",
132
- " <td>850</td>\n",
133
- " <td>0.512200</td>\n",
134
- " </tr>\n",
135
- " <tr>\n",
136
- " <td>900</td>\n",
137
- " <td>0.410600</td>\n",
138
- " </tr>\n",
139
- " <tr>\n",
140
- " <td>950</td>\n",
141
- " <td>0.471700</td>\n",
142
- " </tr>\n",
143
- " <tr>\n",
144
- " <td>1000</td>\n",
145
- " <td>0.401200</td>\n",
146
- " </tr>\n",
147
- " <tr>\n",
148
- " <td>1050</td>\n",
149
- " <td>0.374100</td>\n",
150
- " </tr>\n",
151
- " <tr>\n",
152
- " <td>1100</td>\n",
153
- " <td>0.397300</td>\n",
154
- " </tr>\n",
155
- " <tr>\n",
156
- " <td>1150</td>\n",
157
- " <td>0.363700</td>\n",
158
- " </tr>\n",
159
- " <tr>\n",
160
- " <td>1200</td>\n",
161
- " <td>0.325300</td>\n",
162
- " </tr>\n",
163
- " <tr>\n",
164
- " <td>1250</td>\n",
165
- " <td>0.291000</td>\n",
166
- " </tr>\n",
167
- " <tr>\n",
168
- " <td>1300</td>\n",
169
- " <td>0.335900</td>\n",
170
- " </tr>\n",
171
- " <tr>\n",
172
- " <td>1350</td>\n",
173
- " <td>0.395900</td>\n",
174
- " </tr>\n",
175
- " <tr>\n",
176
- " <td>1400</td>\n",
177
- " <td>0.331800</td>\n",
178
- " </tr>\n",
179
- " </tbody>\n",
180
- "</table><p>"
181
- ],
182
- "text/plain": [
183
- "<IPython.core.display.HTML object>"
184
- ]
185
- },
186
- "metadata": {},
187
- "output_type": "display_data"
188
- },
189
- {
190
- "name": "stderr",
191
- "output_type": "stream",
192
- "text": [
193
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
194
- " warnings.warn(warn_msg)\n"
195
- ]
196
- },
197
- {
198
- "name": "stdout",
199
- "output_type": "stream",
200
- "text": [
201
- "\n",
202
- "Evaluacija na test skupu test-1\n"
203
- ]
204
- },
205
- {
206
- "data": {
207
- "application/vnd.jupyter.widget-view+json": {
208
- "model_id": "fb87532e026a4d0b8aa72d5cbfc4fb2a",
209
- "version_major": 2,
210
- "version_minor": 0
211
- },
212
- "text/plain": [
213
- "Map: 0%| | 0/653 [00:00<?, ? examples/s]"
214
- ]
215
- },
216
- "metadata": {},
217
- "output_type": "display_data"
218
- },
219
- {
220
- "name": "stderr",
221
- "output_type": "stream",
222
- "text": [
223
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
224
- " warnings.warn(warn_msg)\n"
225
- ]
226
- },
227
- {
228
- "data": {
229
- "text/html": [],
230
- "text/plain": [
231
- "<IPython.core.display.HTML object>"
232
- ]
233
- },
234
- "metadata": {},
235
- "output_type": "display_data"
236
- },
237
- {
238
- "name": "stdout",
239
- "output_type": "stream",
240
- "text": [
241
- "Evaluacija: {'eval_loss': 0.8313503265380859, 'eval_accuracy': 0.7136294027565084, 'eval_f1_macro': 0.624180014657386, 'eval_runtime': 16.74, 'eval_samples_per_second': 39.008, 'eval_steps_per_second': 1.254, 'epoch': 3.0}\n"
242
- ]
243
- },
244
- {
245
- "name": "stderr",
246
- "output_type": "stream",
247
- "text": [
248
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
249
- " warnings.warn(warn_msg)\n"
250
- ]
251
- },
252
- {
253
- "name": "stdout",
254
- "output_type": "stream",
255
- "text": [
256
- "Confusion Matrix:\n",
257
- "[[109 48 8]\n",
258
- " [ 70 328 32]\n",
259
- " [ 4 25 29]]\n",
260
- "\n",
261
- "Classification Report:\n",
262
- " precision recall f1-score support\n",
263
- "\n",
264
- " negative 0.60 0.66 0.63 165\n",
265
- " neutral 0.82 0.76 0.79 430\n",
266
- " positive 0.42 0.50 0.46 58\n",
267
- "\n",
268
- " accuracy 0.71 653\n",
269
- " macro avg 0.61 0.64 0.62 653\n",
270
- "weighted avg 0.73 0.71 0.72 653\n",
271
- "\n",
272
- "Predikcije spremljene u results_train_combined/predictions_test_1.csv\n",
273
- "\n",
274
- "Evaluacija na test skupu test-2\n"
275
- ]
276
- },
277
- {
278
- "data": {
279
- "application/vnd.jupyter.widget-view+json": {
280
- "model_id": "109da4be6d1d4c0d826f1320e481d4e1",
281
- "version_major": 2,
282
- "version_minor": 0
283
- },
284
- "text/plain": [
285
- "Map: 0%| | 0/741 [00:00<?, ? examples/s]"
286
- ]
287
- },
288
- "metadata": {},
289
- "output_type": "display_data"
290
- },
291
- {
292
- "name": "stderr",
293
- "output_type": "stream",
294
- "text": [
295
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
296
- " warnings.warn(warn_msg)\n"
297
- ]
298
- },
299
- {
300
- "data": {
301
- "text/html": [],
302
- "text/plain": [
303
- "<IPython.core.display.HTML object>"
304
- ]
305
- },
306
- "metadata": {},
307
- "output_type": "display_data"
308
- },
309
- {
310
- "name": "stdout",
311
- "output_type": "stream",
312
- "text": [
313
- "Evaluacija: {'eval_loss': 0.23835134506225586, 'eval_accuracy': 0.9257759784075573, 'eval_f1_macro': 0.907760132195386, 'eval_runtime': 19.6933, 'eval_samples_per_second': 37.627, 'eval_steps_per_second': 1.219, 'epoch': 3.0}\n"
314
- ]
315
- },
316
- {
317
- "name": "stderr",
318
- "output_type": "stream",
319
- "text": [
320
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
321
- " warnings.warn(warn_msg)\n"
322
- ]
323
- },
324
- {
325
- "name": "stdout",
326
- "output_type": "stream",
327
- "text": [
328
- "Confusion Matrix:\n",
329
- "[[197 16 3]\n",
330
- " [ 14 410 7]\n",
331
- " [ 3 12 79]]\n",
332
- "\n",
333
- "Classification Report:\n",
334
- " precision recall f1-score support\n",
335
- "\n",
336
- " negative 0.92 0.91 0.92 216\n",
337
- " neutral 0.94 0.95 0.94 431\n",
338
- " positive 0.89 0.84 0.86 94\n",
339
- "\n",
340
- " accuracy 0.93 741\n",
341
- " macro avg 0.91 0.90 0.91 741\n",
342
- "weighted avg 0.93 0.93 0.93 741\n",
343
- "\n",
344
- "Predikcije spremljene u results_train_combined/predictions_test_2.csv\n",
345
- "\n",
346
- "Evaluacija na test skupu test-3\n"
347
- ]
348
- },
349
- {
350
- "data": {
351
- "application/vnd.jupyter.widget-view+json": {
352
- "model_id": "3b91d6b9cfde4469a865dd69c79b4f34",
353
- "version_major": 2,
354
- "version_minor": 0
355
- },
356
- "text/plain": [
357
- "Map: 0%| | 0/793 [00:00<?, ? examples/s]"
358
- ]
359
- },
360
- "metadata": {},
361
- "output_type": "display_data"
362
- },
363
- {
364
- "name": "stderr",
365
- "output_type": "stream",
366
- "text": [
367
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
368
- " warnings.warn(warn_msg)\n"
369
- ]
370
- },
371
- {
372
- "data": {
373
- "text/html": [],
374
- "text/plain": [
375
- "<IPython.core.display.HTML object>"
376
- ]
377
- },
378
- "metadata": {},
379
- "output_type": "display_data"
380
- },
381
- {
382
- "name": "stdout",
383
- "output_type": "stream",
384
- "text": [
385
- "Evaluacija: {'eval_loss': 0.8141497373580933, 'eval_accuracy': 0.7679697351828499, 'eval_f1_macro': 0.7678761268324849, 'eval_runtime': 20.857, 'eval_samples_per_second': 38.021, 'eval_steps_per_second': 1.199, 'epoch': 3.0}\n"
386
- ]
387
- },
388
- {
389
- "name": "stderr",
390
- "output_type": "stream",
391
- "text": [
392
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
393
- " warnings.warn(warn_msg)\n"
394
- ]
395
- },
396
- {
397
- "name": "stdout",
398
- "output_type": "stream",
399
- "text": [
400
- "Confusion Matrix:\n",
401
- "[[212 51 4]\n",
402
- " [ 7 250 6]\n",
403
- " [ 6 110 147]]\n",
404
- "\n",
405
- "Classification Report:\n",
406
- " precision recall f1-score support\n",
407
- "\n",
408
- " negative 0.94 0.79 0.86 267\n",
409
- " neutral 0.61 0.95 0.74 263\n",
410
- " positive 0.94 0.56 0.70 263\n",
411
- "\n",
412
- " accuracy 0.77 793\n",
413
- " macro avg 0.83 0.77 0.77 793\n",
414
- "weighted avg 0.83 0.77 0.77 793\n",
415
- "\n",
416
- "Predikcije spremljene u results_train_combined/predictions_test_3.csv\n",
417
- "\n",
418
- "\n",
419
- "=== Treniranje i evaluacija za trening skup: train_2 ===\n",
420
- "\n",
421
- "--- Fine-tuning model: classla/bcms-bertic ---\n"
422
- ]
423
- },
424
- {
425
- "name": "stderr",
426
- "output_type": "stream",
427
- "text": [
428
- "Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']\n",
429
- "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
430
- ]
431
- },
432
- {
433
- "data": {
434
- "application/vnd.jupyter.widget-view+json": {
435
- "model_id": "a7f4d6a07e7544b38dc869d92f60dbf0",
436
- "version_major": 2,
437
- "version_minor": 0
438
- },
439
- "text/plain": [
440
- "Map: 0%| | 0/2221 [00:00<?, ? examples/s]"
441
- ]
442
- },
443
- "metadata": {},
444
- "output_type": "display_data"
445
- },
446
- {
447
- "name": "stderr",
448
- "output_type": "stream",
449
- "text": [
450
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
451
- " warnings.warn(warn_msg)\n"
452
- ]
453
- },
454
- {
455
- "data": {
456
- "text/html": [
457
- "\n",
458
- " <div>\n",
459
- " \n",
460
- " <progress value='417' max='417' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
461
- " [417/417 09:50, Epoch 3/3]\n",
462
- " </div>\n",
463
- " <table border=\"1\" class=\"dataframe\">\n",
464
- " <thead>\n",
465
- " <tr style=\"text-align: left;\">\n",
466
- " <th>Step</th>\n",
467
- " <th>Training Loss</th>\n",
468
- " </tr>\n",
469
- " </thead>\n",
470
- " <tbody>\n",
471
- " <tr>\n",
472
- " <td>50</td>\n",
473
- " <td>0.998700</td>\n",
474
- " </tr>\n",
475
- " <tr>\n",
476
- " <td>100</td>\n",
477
- " <td>0.815300</td>\n",
478
- " </tr>\n",
479
- " <tr>\n",
480
- " <td>150</td>\n",
481
- " <td>0.685500</td>\n",
482
- " </tr>\n",
483
- " <tr>\n",
484
- " <td>200</td>\n",
485
- " <td>0.542600</td>\n",
486
- " </tr>\n",
487
- " <tr>\n",
488
- " <td>250</td>\n",
489
- " <td>0.520300</td>\n",
490
- " </tr>\n",
491
- " <tr>\n",
492
- " <td>300</td>\n",
493
- " <td>0.464000</td>\n",
494
- " </tr>\n",
495
- " <tr>\n",
496
- " <td>350</td>\n",
497
- " <td>0.380800</td>\n",
498
- " </tr>\n",
499
- " <tr>\n",
500
- " <td>400</td>\n",
501
- " <td>0.328300</td>\n",
502
- " </tr>\n",
503
- " </tbody>\n",
504
- "</table><p>"
505
- ],
506
- "text/plain": [
507
- "<IPython.core.display.HTML object>"
508
- ]
509
- },
510
- "metadata": {},
511
- "output_type": "display_data"
512
- },
513
- {
514
- "name": "stdout",
515
- "output_type": "stream",
516
- "text": [
517
- "\n",
518
- "Evaluacija na test skupu test-1\n"
519
- ]
520
- },
521
- {
522
- "data": {
523
- "application/vnd.jupyter.widget-view+json": {
524
- "model_id": "4816fe0de415465ca0d982ff0ddc9f1b",
525
- "version_major": 2,
526
- "version_minor": 0
527
- },
528
- "text/plain": [
529
- "Map: 0%| | 0/653 [00:00<?, ? examples/s]"
530
- ]
531
- },
532
- "metadata": {},
533
- "output_type": "display_data"
534
- },
535
- {
536
- "name": "stderr",
537
- "output_type": "stream",
538
- "text": [
539
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
540
- " warnings.warn(warn_msg)\n"
541
- ]
542
- },
543
- {
544
- "data": {
545
- "text/html": [],
546
- "text/plain": [
547
- "<IPython.core.display.HTML object>"
548
- ]
549
- },
550
- "metadata": {},
551
- "output_type": "display_data"
552
- },
553
- {
554
- "name": "stdout",
555
- "output_type": "stream",
556
- "text": [
557
- "Evaluacija: {'eval_loss': 0.8404552340507507, 'eval_accuracy': 0.6906584992343032, 'eval_f1_macro': 0.5999228826553304, 'eval_runtime': 15.1161, 'eval_samples_per_second': 43.199, 'eval_steps_per_second': 1.389, 'epoch': 3.0}\n"
558
- ]
559
- },
560
- {
561
- "name": "stderr",
562
- "output_type": "stream",
563
- "text": [
564
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
565
- " warnings.warn(warn_msg)\n"
566
- ]
567
- },
568
- {
569
- "name": "stdout",
570
- "output_type": "stream",
571
- "text": [
572
- "Confusion Matrix:\n",
573
- "[[116 42 7]\n",
574
- " [ 86 309 35]\n",
575
- " [ 7 25 26]]\n",
576
- "\n",
577
- "Classification Report:\n",
578
- " precision recall f1-score support\n",
579
- "\n",
580
- " negative 0.56 0.70 0.62 165\n",
581
- " neutral 0.82 0.72 0.77 430\n",
582
- " positive 0.38 0.45 0.41 58\n",
583
- "\n",
584
- " accuracy 0.69 653\n",
585
- " macro avg 0.59 0.62 0.60 653\n",
586
- "weighted avg 0.72 0.69 0.70 653\n",
587
- "\n",
588
- "Predikcije spremljene u results_train_2/predictions_test_1.csv\n",
589
- "\n",
590
- "Evaluacija na test skupu test-2\n"
591
- ]
592
- },
593
- {
594
- "data": {
595
- "application/vnd.jupyter.widget-view+json": {
596
- "model_id": "6459394251ff4731938cb87fdba6c9bd",
597
- "version_major": 2,
598
- "version_minor": 0
599
- },
600
- "text/plain": [
601
- "Map: 0%| | 0/741 [00:00<?, ? examples/s]"
602
- ]
603
- },
604
- "metadata": {},
605
- "output_type": "display_data"
606
- },
607
- {
608
- "name": "stderr",
609
- "output_type": "stream",
610
- "text": [
611
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
612
- " warnings.warn(warn_msg)\n"
613
- ]
614
- },
615
- {
616
- "data": {
617
- "text/html": [],
618
- "text/plain": [
619
- "<IPython.core.display.HTML object>"
620
- ]
621
- },
622
- "metadata": {},
623
- "output_type": "display_data"
624
- },
625
- {
626
- "name": "stdout",
627
- "output_type": "stream",
628
- "text": [
629
- "Evaluacija: {'eval_loss': 0.5182289481163025, 'eval_accuracy': 0.8083670715249662, 'eval_f1_macro': 0.7534545808339037, 'eval_runtime': 17.475, 'eval_samples_per_second': 42.403, 'eval_steps_per_second': 1.373, 'epoch': 3.0}\n"
630
- ]
631
- },
632
- {
633
- "name": "stderr",
634
- "output_type": "stream",
635
- "text": [
636
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
637
- " warnings.warn(warn_msg)\n"
638
- ]
639
- },
640
- {
641
- "name": "stdout",
642
- "output_type": "stream",
643
- "text": [
644
- "Confusion Matrix:\n",
645
- "[[163 44 9]\n",
646
- " [ 32 381 18]\n",
647
- " [ 10 29 55]]\n",
648
- "\n",
649
- "Classification Report:\n",
650
- " precision recall f1-score support\n",
651
- "\n",
652
- " negative 0.80 0.75 0.77 216\n",
653
- " neutral 0.84 0.88 0.86 431\n",
654
- " positive 0.67 0.59 0.62 94\n",
655
- "\n",
656
- " accuracy 0.81 741\n",
657
- " macro avg 0.77 0.74 0.75 741\n",
658
- "weighted avg 0.80 0.81 0.81 741\n",
659
- "\n",
660
- "Predikcije spremljene u results_train_2/predictions_test_2.csv\n",
661
- "\n",
662
- "Evaluacija na test skupu test-3\n"
663
- ]
664
- },
665
- {
666
- "data": {
667
- "application/vnd.jupyter.widget-view+json": {
668
- "model_id": "aed4ce93c6614d97abd44fe642c0b3fa",
669
- "version_major": 2,
670
- "version_minor": 0
671
- },
672
- "text/plain": [
673
- "Map: 0%| | 0/793 [00:00<?, ? examples/s]"
674
- ]
675
- },
676
- "metadata": {},
677
- "output_type": "display_data"
678
- },
679
- {
680
- "name": "stderr",
681
- "output_type": "stream",
682
- "text": [
683
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
684
- " warnings.warn(warn_msg)\n"
685
- ]
686
- },
687
- {
688
- "data": {
689
- "text/html": [],
690
- "text/plain": [
691
- "<IPython.core.display.HTML object>"
692
- ]
693
- },
694
- "metadata": {},
695
- "output_type": "display_data"
696
- },
697
- {
698
- "name": "stdout",
699
- "output_type": "stream",
700
- "text": [
701
- "Evaluacija: {'eval_loss': 0.9036539793014526, 'eval_accuracy': 0.7112232030264817, 'eval_f1_macro': 0.7055643128874013, 'eval_runtime': 18.7008, 'eval_samples_per_second': 42.405, 'eval_steps_per_second': 1.337, 'epoch': 3.0}\n"
702
- ]
703
- },
704
- {
705
- "name": "stderr",
706
- "output_type": "stream",
707
- "text": [
708
- "/Library/Frameworks/Python.framework/Versions/3.13/lib/python3.13/site-packages/torch/utils/data/dataloader.py:683: UserWarning: 'pin_memory' argument is set as true but not supported on MPS now, then device pinned memory won't be used.\n",
709
- " warnings.warn(warn_msg)\n"
710
- ]
711
- },
712
- {
713
- "name": "stdout",
714
- "output_type": "stream",
715
- "text": [
716
- "Confusion Matrix:\n",
717
- "[[204 53 10]\n",
718
- " [ 17 239 7]\n",
719
- " [ 13 129 121]]\n",
720
- "\n",
721
- "Classification Report:\n",
722
- " precision recall f1-score support\n",
723
- "\n",
724
- " negative 0.87 0.76 0.81 267\n",
725
- " neutral 0.57 0.91 0.70 263\n",
726
- " positive 0.88 0.46 0.60 263\n",
727
- "\n",
728
- " accuracy 0.71 793\n",
729
- " macro avg 0.77 0.71 0.71 793\n",
730
- "weighted avg 0.77 0.71 0.71 793\n",
731
- "\n",
732
- "Predikcije spremljene u results_train_2/predictions_test_3.csv\n"
733
- ]
734
- }
735
- ],
736
- "source": [
737
- "import pandas as pd\n",
738
- "import torch\n",
739
- "from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
740
- "from datasets import Dataset\n",
741
- "from sklearn.metrics import classification_report, confusion_matrix\n",
742
- "\n",
743
def load_and_prepare_data(train_path):
    """Read a training CSV and wrap it as a ``datasets.Dataset``.

    A column named ``Label`` (if present) is renamed to ``label``; all other
    columns are passed through unchanged.

    Args:
        train_path: Path of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        A ``Dataset`` built from the renamed DataFrame.
    """
    frame = pd.read_csv(train_path).rename(columns={"Label": "label"})
    return Dataset.from_pandas(frame)
747
- "\n",
748
def load_and_prepare_test_data(test_path):
    """Read a test CSV and return it both as a ``Dataset`` and as a DataFrame.

    A column named ``Label`` (if present) is renamed to ``label`` before the
    ``Dataset`` is built, so both returned objects share the same column names.

    Args:
        test_path: Path of the CSV file, handed straight to ``pd.read_csv``.

    Returns:
        A ``(dataset, dataframe)`` tuple: the ``Dataset`` created from the
        renamed DataFrame, and that DataFrame itself.
    """
    frame = pd.read_csv(test_path).rename(columns={"Label": "label"})
    dataset = Dataset.from_pandas(frame)
    return dataset, frame
752
- "\n",
753
def tokenize_dataset(dataset, tokenizer, text_column='Sentence', max_length=128):
    """Tokenize one text column of ``dataset`` and expose it as torch tensors.

    Every example is padded/truncated to exactly ``max_length`` tokens, so all
    rows end up with the same sequence length.

    Args:
        dataset: Object providing ``column_names``, ``map(fn, batched=True)``
            and ``set_format(...)`` (a ``datasets.Dataset`` in this notebook).
            It must also contain a ``label`` column, which is kept in the
            torch-formatted view.
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=..., truncation=..., max_length=...)``.
        text_column: Name of the column holding the raw text. Defaults to
            ``'Sentence'``, the column this notebook's CSV files use.
        max_length: Fixed sequence length used for padding and truncation.
            Defaults to 128.

    Returns:
        The mapped dataset, formatted to yield ``input_ids``,
        ``attention_mask`` and ``label`` as torch tensors.

    Raises:
        KeyError: If ``text_column`` is not a column of ``dataset``.
    """
    # Fail early with a readable message instead of a KeyError raised from
    # inside the batched map call.
    if text_column not in dataset.column_names:
        raise KeyError(
            f"Column {text_column!r} not found in dataset; "
            f"available columns: {list(dataset.column_names)}"
        )

    def tokenize_function(examples):
        return tokenizer(
            examples[text_column],
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )

    tokenized = dataset.map(tokenize_function, batched=True)
    tokenized.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
    return tokenized
759
- "\n",
760
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro-averaged F1.

    Args:
        eval_pred: Two-element iterable unpacked as ``(logits, labels)``.
            ``logits`` must be convertible with ``torch.tensor`` and have the
            class scores along axis 1.

    Returns:
        ``{'accuracy': ..., 'f1_macro': ...}`` taken from scikit-learn's
        ``classification_report`` dictionary output.
    """
    logits, labels = eval_pred
    # The highest-scoring class in each row is the predicted label.
    predicted = torch.tensor(logits).argmax(dim=1).numpy()
    summary = classification_report(labels, predicted, output_dict=True)
    return {
        'accuracy': summary['accuracy'],
        'f1_macro': summary['macro avg']['f1-score'],
    }
767
- "\n",
768
def train_and_evaluate(model_name, train_dataset, test_datasets, raw_test_dfs, output_base_dir):
    """Fine-tune ``model_name`` on ``train_dataset`` and score it on each test set.

    For every test set the function prints the metrics, the confusion matrix
    and the classification report, then writes a CSV with the predictions.

    Args:
        model_name: Checkpoint identifier passed to ``AutoTokenizer`` and
            ``AutoModelForSequenceClassification``.
        train_dataset: Untokenized training dataset; tokenized here with
            ``tokenize_dataset``.
        test_datasets: Sequence of untokenized test datasets.
        raw_test_dfs: Sequence of DataFrames parallel to ``test_datasets``;
            each must contain a ``label`` column and have one row per example.
        output_base_dir: Directory prefix. The model is saved under
            ``{output_base_dir}/model``, logs go to ``{output_base_dir}/logs``
            and predictions to ``{output_base_dir}/predictions_test_{i}.csv``.

    Raises:
        ValueError: If ``test_datasets`` and ``raw_test_dfs`` differ in length.
    """
    # Report order of the classes; the list index is the integer class id.
    # NOTE(review): assumes the CSV labels are encoded 0=negative, 1=neutral,
    # 2=positive -- confirm against the data files.
    class_names = ['negative', 'neutral', 'positive']
    class_ids = list(range(len(class_names)))

    # zip() would silently drop the unmatched tail, pairing predictions with
    # the wrong frame count; make the mismatch loud instead.
    if len(test_datasets) != len(raw_test_dfs):
        raise ValueError(
            f"test_datasets ({len(test_datasets)}) and raw_test_dfs "
            f"({len(raw_test_dfs)}) must have the same length"
        )

    print(f"\n--- Fine-tuning model: {model_name} ---")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(class_names))

    tokenized_train = tokenize_dataset(train_dataset, tokenizer)

    training_args = TrainingArguments(
        output_dir=f"{output_base_dir}/model",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        load_best_model_at_end=False,
        logging_dir=f"{output_base_dir}/logs",
        logging_steps=50,
        save_total_limit=2,
        seed=42,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        compute_metrics=compute_metrics,
    )

    # Train the model.
    trainer.train()

    # Save the model after training.
    trainer.save_model()

    # Evaluate and predict on every test set.
    for i, (test_dataset, raw_test_df) in enumerate(zip(test_datasets, raw_test_dfs), start=1):
        print(f"\nEvaluacija na test skupu test-{i}")

        tokenized_test = tokenize_dataset(test_dataset, tokenizer)

        # A single forward pass yields both the predictions and the metrics;
        # the "eval" prefix keeps the metric keys (eval_loss, eval_accuracy,
        # ...) the same as a separate trainer.evaluate() call would produce.
        predictions_output = trainer.predict(tokenized_test, metric_key_prefix="eval")
        eval_results = predictions_output.metrics
        print(f"Evaluacija: {eval_results}")

        preds = torch.argmax(torch.tensor(predictions_output.predictions), axis=1).numpy()
        labels = predictions_output.label_ids

        # Explicit class ids keep the matrix 3x3 and the report valid even
        # when a class is absent from both labels and predictions.
        print("Confusion Matrix:")
        print(confusion_matrix(labels, preds, labels=class_ids))

        print("\nClassification Report:")
        print(classification_report(labels, preds, labels=class_ids, target_names=class_names))

        # Save the predictions to CSV.
        output_df = raw_test_df.copy()
        output_df['predicted_label'] = preds
        output_df['correct'] = output_df['label'] == output_df['predicted_label']
        output_csv = f"{output_base_dir}/predictions_test_{i}.csv"
        output_df.to_csv(output_csv, index=False)
        print(f"Predikcije spremljene u {output_csv}")
828
- "\n",
829
if __name__ == "__main__":
    # Training sets; a separate model is fine-tuned on each one.
    train_files = {
        "train_combined": "TRAIN.csv",
        "train_2": "train-2.csv"
    }

    # Load every test set once, keeping the Dataset and the raw DataFrame
    # for each file in two parallel lists.
    test_files = ["test-1.csv", "test-2.csv", "test-3.csv"]
    loaded_tests = [load_and_prepare_test_data(path) for path in test_files]
    test_datasets = [dataset for dataset, _ in loaded_tests]
    raw_test_dfs = [frame for _, frame in loaded_tests]

    model_name = "classla/bcms-bertic"

    # For each training set, train a model and evaluate it on all three tests.
    for train_name in train_files:
        print(f"\n\n=== Treniranje i evaluacija za trening skup: {train_name} ===")
        train_and_evaluate(
            model_name,
            load_and_prepare_data(train_files[train_name]),
            test_datasets,
            raw_test_dfs,
            f"results_{train_name}",
        )
853
- ]
854
- }
855
- ],
856
- "metadata": {
857
- "kernelspec": {
858
- "display_name": "Python 3",
859
- "language": "python",
860
- "name": "python3"
861
- },
862
- "language_info": {
863
- "codemirror_mode": {
864
- "name": "ipython",
865
- "version": 3
866
- },
867
- "file_extension": ".py",
868
- "mimetype": "text/x-python",
869
- "name": "python",
870
- "nbconvert_exporter": "python",
871
- "pygments_lexer": "ipython3",
872
- "version": "3.13.3"
873
- }
874
- },
875
- "nbformat": 4,
876
- "nbformat_minor": 5
877
- }