jakeh311 committed
Commit 9d65ca7 · verified · 1 Parent(s): 493f12f

Upload Sentence Classification Model.py


A sentence classification model that fine-tunes BERT and DistilBERT to classify sentence sentiment.

Files changed (1)
  1. Sentence Classification Model.py +151 -0
Sentence Classification Model.py ADDED
# Importing the required libraries.
# (The pip install line is notebook-style syntax; run this script in Colab/Jupyter.)

!pip install datasets huggingface_hub
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW  # transformers' AdamW is deprecated, so the PyTorch implementation is used
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score
# Loading the datasets and defining a preprocessing function for each one.

sst2 = load_dataset("stanfordnlp/sst2")
imdb = load_dataset("stanfordnlp/imdb")

# SST-2 keeps its text in the 'sentence' column, IMDB in 'text'.
def data_preprocessor_sst2(data, token):
    return data.map(lambda x: token(x['sentence'], truncation=True, padding='max_length', max_length=128), batched=True)

def data_preprocessor_imdb(data, token):
    return data.map(lambda x: token(x['text'], truncation=True, padding='max_length', max_length=128), batched=True)
# Loading the BERT and DistilBERT tokenizers to tokenize the datasets.

token_B = AutoTokenizer.from_pretrained("bert-base-uncased")
token_DB = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# All splits are tokenized with the BERT tokenizer; this also works for DistilBERT,
# since distilbert-base-uncased shares bert-base-uncased's WordPiece vocabulary.
train_sst2 = data_preprocessor_sst2(sst2['train'], token_B)
test_sst2 = data_preprocessor_sst2(sst2['validation'], token_B)
test_imdb = data_preprocessor_imdb(imdb['test'], token_B)
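# A quick, optional sanity check (an illustrative addition, not part of the original
# script): decode the start of the first tokenized SST-2 example to confirm that
# preprocessing produced sensible input_ids.
print(token_B.decode(train_sst2[0]['input_ids'][:16]))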
# Formatting the datasets so the model-relevant columns are returned as PyTorch tensors.

train_sst2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_sst2.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_imdb.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
# Creating a data loader for each of the splits; only the training split is shuffled.

def data_loader(data, batch=32, shuffle=False):
    return DataLoader(data, batch_size=batch, shuffle=shuffle)

train_DL = data_loader(train_sst2, shuffle=True)
test_DL = data_loader(test_sst2)
test_DL_imdb = data_loader(test_imdb)
# Loading the BERT and DistilBERT models, each with a fresh two-class classification head.

model_B = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model_DB = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
# Building a function to train and optimize a model.

def model_trainer(model, data, epochs=1):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=5e-5)
    model.train()
    for epoch in range(epochs):
        loss_total = 0
        for batch in data:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = (batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device))
            forward_info = model(input_ids, attention_mask=attention_mask, labels=labels)
            forward_loss = forward_info.loss
            loss_total += forward_loss.item()
            forward_loss.backward()
            optimizer.step()
        print(f"Epoch {epoch + 1} - Loss = {loss_total / len(data)}")
    return model
# Building a function to evaluate a model.

def model_evaluator(model, data):
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    preds, trues = [], []
    with torch.no_grad():
        for batch in data:
            input_ids, attention_mask, labels = (batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].to(device))
            forward_info = model(input_ids, attention_mask=attention_mask)
            logits = forward_info.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            trues.extend(labels.cpu().numpy())
    score_acc = accuracy_score(trues, preds)
    score_f1 = f1_score(trues, preds)
    return score_acc, score_f1
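# Optional reproducibility step (an illustrative addition, not in the original upload):
# fixing the random seeds makes shuffling and dropout repeatable across runs; the
# seed value 42 is an arbitrary choice.
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)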
# Fine-tuning both models on the SST-2 training split.

model_B = model_trainer(model_B, train_DL)
model_DB = model_trainer(model_DB, train_DL)
# Evaluating the models on the SST-2 validation split.

bert_acc_sst2, bert_f1_sst2 = model_evaluator(model_B, test_DL)
dist_acc_sst2, dist_f1_sst2 = model_evaluator(model_DB, test_DL)

print(f"BERT Accuracy: {bert_acc_sst2}, BERT F1: {bert_f1_sst2}")
print(f"DistilBERT Accuracy: {dist_acc_sst2}, DistilBERT F1: {dist_f1_sst2}")
# Evaluating the models on the IMDB test split, which checks how well the
# SST-2-trained models generalize to out-of-domain reviews.

bert_acc_imdb, bert_f1_imdb = model_evaluator(model_B, test_DL_imdb)
dist_acc_imdb, dist_f1_imdb = model_evaluator(model_DB, test_DL_imdb)

print(f"BERT Accuracy: {bert_acc_imdb}, BERT F1: {bert_f1_imdb}")
print(f"DistilBERT Accuracy: {dist_acc_imdb}, DistilBERT F1: {dist_f1_imdb}")
# Defining a function to classify a single sentence (returns 0 for negative, 1 for positive).

def classify(sentence: str, model, token):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    inputs = token(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        forward_info = model(**inputs)
    pred = forward_info.logits.argmax(dim=1).item()
    return pred
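# Illustrative helper (an addition, not in the original upload): SST-2 labels map
# 0 to negative and 1 to positive, so a small lookup makes predictions readable.
label_names = {0: "negative", 1: "positive"}

def classify_label(sentence: str, model, token):
    return label_names[classify(sentence, model, token)]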
# Testing sentence 1.

sentence1 = "for all its highfalutin title and corkscrew narrative , the movie turns out to be not much more than a shaggy human tale ."
print(f"BERT Prediction: {classify(sentence1, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence1, model_DB, token_DB)}")

# Testing sentence 2.

sentence2 = "its underlying mythology is a hodgepodge of inconsistencies that pose the question : since when did dumb entertainment have to be this dumb ?"
print(f"BERT Prediction: {classify(sentence2, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence2, model_DB, token_DB)}")

# Testing sentence 3.

sentence3 = '''
the actors do n't inhabit their roles -- they 're trapped by them ,
forced to change behavior in bizarre unjustified fashion and spout dialog that consists mostly of platitudes .
'''
print(f"BERT Prediction: {classify(sentence3, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence3, model_DB, token_DB)}")

# Testing sentence 4.

sentence4 = "an absorbing trip into the minds and motivations of people under stress as well as a keen , unsentimental look at variations on the theme of motherhood ."
print(f"BERT Prediction: {classify(sentence4, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence4, model_DB, token_DB)}")

# Testing sentence 5.

sentence5 = "one of those rare , exhilarating cinematic delights that gets even better in hindsight , as you mull over its every nuance in your mind ."
print(f"BERT Prediction: {classify(sentence5, model_B, token_B)}")
print(f"DistilBERT Prediction: {classify(sentence5, model_DB, token_DB)}")