Commit cbd9375 · MarieAngeA13 committed · 1 Parent(s): 7de4352

Delete copie_de_08_sentiment_analysis_with_bert.py
copie_de_08_sentiment_analysis_with_bert.py DELETED
@@ -1,514 +0,0 @@
# -*- coding: utf-8 -*-
"""Copie_de_08_sentiment_analysis_with_bert.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1zHnnWVxTXMeLoDe2L-hV_LzK6S7Flgps
"""

!nvidia-smi

"""## Setup

We'll need [the Transformers library](https://huggingface.co/transformers/) by Hugging Face:
"""

!pip install -q -U watermark

!pip install -qq transformers

# Commented out IPython magic to ensure Python compatibility.
# %reload_ext watermark
# %watermark -v -p numpy,pandas,torch,transformers

# Commented out IPython magic to ensure Python compatibility.
#@title Setup & Config
import transformers
# Note: AdamW was removed from recent transformers releases; on newer versions
# import it from torch.optim instead (torch.optim.AdamW has no correct_bias arg).
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# %matplotlib inline
# %config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 12, 8

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

!gdown --id 1S6qMioqPJjyBLpLVz4gmRTnJHnjitnuV
!gdown --id 1zdmewp7ayS4js4VtrJEHzAheSW-5NBZv

df = pd.read_csv("reviews.csv")
df.head()

df.shape

df.info()

print(df.score)

sns.countplot(x='score', data=df)
plt.xlabel('review score');

def to_sentiment(rating):
  rating = int(rating)
  if rating <= 2:
    return 0
  elif rating == 3:
    return 1
  else:
    return 2

df['sentiment'] = df.score.apply(to_sentiment)
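
# Quick sanity check of the mapping (illustrative, not in the original):
# ratings 1-2 -> negative (0), 3 -> neutral (1), 4-5 -> positive (2).
assert [to_sentiment(r) for r in [1, 2, 3, 4, 5]] == [0, 0, 1, 2, 2]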

class_names = ['negative', 'neutral', 'positive']

print(df.sentiment)

ax = sns.countplot(x='sentiment', data=df)
plt.xlabel('review sentiment')
ax.set_xticklabels(class_names);

PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

tokenizer.sep_token, tokenizer.sep_token_id

tokenizer.cls_token, tokenizer.cls_token_id

tokenizer.pad_token, tokenizer.pad_token_id

tokenizer.unk_token, tokenizer.unk_token_id
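
# Quick check (a sketch, not in the original): with add_special_tokens=True,
# encode() frames the sequence as [CLS] ... [SEP].
ids = tokenizer.encode(sample_txt, add_special_tokens=True)
assert ids[0] == tokenizer.cls_token_id and ids[-1] == tokenizer.sep_token_id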

encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',    # pad_to_max_length is deprecated in transformers
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',     # Return PyTorch tensors
)

encoding.keys()

print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
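
# Sketch (not in the original): the attention mask is 1 for real tokens and 0
# for [PAD] positions, so it should match a comparison against the pad token id.
mask = encoding['attention_mask'][0]
assert (mask == (encoding['input_ids'][0] != tokenizer.pad_token_id).long()).all()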

token_lens = []

for txt in df.content:
  tokens = tokenizer.encode(txt, max_length=512, truncation=True)
  token_lens.append(len(tokens))

# sns.distplot is deprecated in recent seaborn; sns.histplot(token_lens, kde=True)
# is the modern equivalent.
sns.distplot(token_lens)
plt.xlim([0, 256]);
plt.xlabel('Token count');

# Most reviews fall well under 160 tokens, so this cap covers nearly all of
# them without excessive padding.
MAX_LEN = 160

class GPReviewDataset(Dataset):

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.reviews)

  def __getitem__(self, item):
    review = str(self.reviews[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',  # replaces the deprecated pad_to_max_length=True
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'review_text': review,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }
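
# Minimal usage sketch (not in the original): build a tiny dataset from the
# first few rows and inspect one item's shapes.
demo_ds = GPReviewDataset(
  reviews=df.content.to_numpy()[:4],
  targets=df.sentiment.to_numpy()[:4],
  tokenizer=tokenizer,
  max_len=MAX_LEN
)
item = demo_ds[0]
print(item['input_ids'].shape, item['attention_mask'].shape, item['targets'])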

# Hold out 10%, then split that half-and-half: roughly 90% train / 5% val / 5% test.
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=RANDOM_SEED)

df_train.shape, df_val.shape, df_test.shape

def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = GPReviewDataset(
    reviews=df.content.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=4
  )
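
# A possible refinement (not in the original notebook): pass shuffle=True when
# building the training loader so batch order changes each epoch.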

BATCH_SIZE = 16

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

data = next(iter(train_data_loader))
data.keys()

print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)

last_hidden_state, pooled_output = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask'],
  return_dict=False
)

last_hidden_state.shape

bert_model.config.hidden_size

pooled_output.shape
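
# Shape sketch (assuming the 32-token encoding built above): last_hidden_state
# is [batch, seq_len, hidden]; pooled_output is [batch, hidden].
assert last_hidden_state.shape == (1, 32, bert_model.config.hidden_size)
assert pooled_output.shape == (1, bert_model.config.hidden_size)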

class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    returned = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    pooled_output = returned["pooler_output"]
    output = self.drop(pooled_output)
    return self.out(output)

model = SentimentClassifier(len(class_names))
model = model.to(device)

input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

F.softmax(model(input_ids, attention_mask), dim=1)
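
# The classifier outputs raw logits; softmax over dim=1 turns each row into a
# probability distribution across the three classes (a sketch check below).
probs = F.softmax(model(input_ids, attention_mask), dim=1)
assert torch.allclose(probs.sum(dim=1), torch.ones(probs.shape[0], device=device))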

"""### Training"""

EPOCHS = 6

# transformers' AdamW with correct_bias=False mirrors the original BERT
# implementation; on newer transformers use torch.optim.AdamW instead.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)
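
# With num_warmup_steps=0, the schedule is a pure linear decay:
# lr(step) = 2e-5 * max(0, 1 - step / total_steps)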

def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  model = model.train()

  losses = []
  correct_predictions = 0

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)

    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)
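
# Ordering note: backward() -> clip_grad_norm_() -> optimizer.step() ->
# scheduler.step() -> zero_grad(), so clipping sees the raw gradients and the
# learning rate advances once per batch.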

def eval_model(model, data_loader, loss_fn, device, n_examples):
  model = model.eval()

  losses = []
  correct_predictions = 0

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  return correct_predictions.double() / n_examples, np.mean(losses)
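
# model.eval() disables dropout and torch.no_grad() skips gradient tracking,
# so evaluation is deterministic and memory-light.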

# Fine-tuning loop: train for EPOCHS epochs and checkpoint the model whenever
# validation accuracy improves. (In the notebook this cell was timed with %%time.)
history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )

  print(f'Val loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

print(history['train_acc'])

# Accuracies are GPU tensors; move them to the CPU before plotting.
list_of_train_accuracy = [t.cpu().numpy() for t in history['train_acc']]
list_of_train_accuracy

print(history['val_acc'])

list_of_val_accuracy = [t.cpu().numpy() for t in history['val_acc']]
list_of_val_accuracy

plt.plot(list_of_train_accuracy, label='train accuracy')
plt.plot(list_of_val_accuracy, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

# Optional: download a pre-trained checkpoint and load it instead of training.
# !gdown --id 1V8itWtowCYnb2Bc9KlK9SxGff9WwmogA

# model = SentimentClassifier(len(class_names))
# model.load_state_dict(torch.load('best_model_state.bin'))
# model = model.to(device)

test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

print('\n')
print('Test Accuracy : ', test_acc.item())

def get_predictions(model, data_loader):
  model = model.eval()

  review_texts = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in data_loader:

      texts = d["review_text"]
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      probs = F.softmax(outputs, dim=1)

      review_texts.extend(texts)
      predictions.extend(preds)
      prediction_probs.extend(probs)
      real_values.extend(targets)

  predictions = torch.stack(predictions).cpu()
  prediction_probs = torch.stack(prediction_probs).cpu()
  real_values = torch.stack(real_values).cpu()
  return review_texts, predictions, prediction_probs, real_values

y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

print(classification_report(y_test, y_pred, target_names=class_names))

def show_confusion_matrix(conf_matrix):  # conf_matrix avoids shadowing sklearn's confusion_matrix
  hmap = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
  hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation=0, ha='right')
  hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation=30, ha='right')
  plt.ylabel('True sentiment')
  plt.xlabel('Predicted sentiment');

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)

idx = 2

review_text = y_review_texts[idx]
true_sentiment = y_test[idx]
pred_df = pd.DataFrame({
  'class_names': class_names,
  'values': y_pred_probs[idx]
})

print("\n".join(wrap(review_text)))
print()
print(f'True sentiment: {class_names[true_sentiment]}')

sns.barplot(x='values', y='class_names', data=pred_df, orient='h')
plt.ylabel('sentiment')
plt.xlabel('probability')
plt.xlim([0, 1]);

review_text = "I hate you!!!"

encoded_review = tokenizer.encode_plus(
  review_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  return_token_type_ids=False,
  padding='max_length',  # replaces the deprecated pad_to_max_length=True
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction]}')
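
# Hypothetical convenience wrapper (not in the original notebook): bundles the
# tokenize -> forward -> argmax steps above into one call.
def predict_sentiment(text):
  enc = tokenizer.encode_plus(
    text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
  )
  with torch.no_grad():
    logits = model(enc['input_ids'].to(device), enc['attention_mask'].to(device))
  return class_names[logits.argmax(dim=1).item()]

print(predict_sentiment("Best app I have ever used!"))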

"""## References

- [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
- [L11 Language Models - Alec Radford (OpenAI)](https://www.youtube.com/watch?v=BnpB3GrpsfM)
- [The Illustrated BERT, ELMo, and co.](https://jalammar.github.io/illustrated-bert/)
- [BERT Fine-Tuning Tutorial with PyTorch](https://mccormickml.com/2019/07/22/BERT-fine-tuning/)
- [How to Fine-Tune BERT for Text Classification?](https://arxiv.org/pdf/1905.05583.pdf)
- [Huggingface Transformers](https://huggingface.co/transformers/)
- [BERT Explained: State of the art language model for NLP](https://towardsdatascience.com/bert-explained-state-of-the-art-language-model-for-nlp-f8b21a9b6270)
"""