ynp3 committed on
Commit e62a0d5
1 Parent(s): 36182b8

Delete finetuning.py

Files changed (1)
  1. finetuning.py +0 -239
finetuning.py DELETED
@@ -1,239 +0,0 @@
import shutil

import numpy as np
import pandas as pd
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

device = 'cuda' if cuda.is_available() else 'cpu'

# Label columns for the six-way multi-label toxicity task
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df_train = pd.read_csv("train.csv")
print(df_train.head(3))

# Hyperparameters
MAX_LEN = 512
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 2
LEARNING_RATE = 1e-05

# Subsample for a quick run, then do an 80/20 train/validation split
df_train = df_train.sample(n=512)

train_size = 0.8
df_train_sampled = df_train.sample(frac=train_size, random_state=44)
df_val = df_train.drop(df_train_sampled.index).reset_index(drop=True)
df_train_sampled = df_train_sampled.reset_index(drop=True)
print(df_train_sampled.shape, df_val.shape)

model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name, do_lower_case=True)
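
# A minimal sketch (not in the original file): what encode_plus returns for a
# single string, using the same arguments the Dataset below will pass.
enc = tokenizer.encode_plus("you are great", None, add_special_tokens=True,
                            max_length=16, padding='max_length', truncation=True,
                            return_attention_mask=True, return_tensors='pt')
print(enc['input_ids'].shape)       # torch.Size([1, 16])
print(enc['attention_mask'].shape)  # torch.Size([1, 16])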

# Custom Dataset
class ToxicDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.labels = self.data[label_cols].values

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Tokenize the single comment at this index, padded/truncated to max_len
        text = str(self.data.comment_text[idx])
        tokenized_text = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': tokenized_text['input_ids'].flatten(),
            'attention_mask': tokenized_text['attention_mask'].flatten(),
            'targets': torch.FloatTensor(self.labels[idx])
        }

train_dataset = ToxicDataset(df_train_sampled, tokenizer, MAX_LEN)
valid_dataset = ToxicDataset(df_val, tokenizer, MAX_LEN)

train_data_loader = DataLoader(train_dataset,
                               batch_size=TRAIN_BATCH_SIZE,
                               shuffle=True,
                               num_workers=0)

val_data_loader = DataLoader(valid_dataset,
                             batch_size=VALID_BATCH_SIZE,
                             shuffle=False,
                             num_workers=0)
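
# A quick sanity check (an addition, not in the original file): pull one batch
# and confirm the tensor shapes produced by ToxicDataset and the DataLoader.
batch = next(iter(train_data_loader))
print(batch['input_ids'].shape)       # (TRAIN_BATCH_SIZE, MAX_LEN)
print(batch['attention_mask'].shape)  # (TRAIN_BATCH_SIZE, MAX_LEN)
print(batch['targets'].shape)         # (TRAIN_BATCH_SIZE, 6)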

# Custom Model Class
class CustomDistilBertClass(torch.nn.Module):
    def __init__(self):
        super(CustomDistilBertClass, self).__init__()
        self.distilbert_model = DistilBertModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(0.3)
        self.linear = torch.nn.Linear(768, 6)

    def forward(self, input_ids, attn_mask):
        output = self.distilbert_model(
            input_ids,
            attention_mask=attn_mask,
        )
        # Project every token's hidden state to 6 logits; the training loop
        # keeps only the [CLS] position (index 0) for classification
        output_dropout = self.dropout(output.last_hidden_state)
        output = self.linear(output_dropout)
        return output


model = CustomDistilBertClass()
model.to(device)
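
# A hedged smoke test (not in the original file): one forward pass to confirm
# the model emits per-token logits of shape (batch, MAX_LEN, 6); the training
# loop below keeps only the [CLS] position at index 0.
with torch.no_grad():
    sample = next(iter(val_data_loader))
    logits = model(sample['input_ids'].to(device),
                   sample['attention_mask'].to(device))
print(logits.shape)           # (batch_size, MAX_LEN, 6)
print(logits[:, 0, :].shape)  # (batch_size, 6)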

def loss_fn(outputs, targets):
    # BCEWithLogitsLoss fuses a sigmoid with binary cross-entropy,
    # one independent binary decision per label
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
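
# A worked example (an addition, not in the original file): BCEWithLogitsLoss
# expects raw logits, so no sigmoid is applied before the loss; a sigmoid is
# only needed when thresholding predictions.
example_logits = torch.tensor([[2.0, -1.0, 0.5, -3.0, 1.5, -0.5]])
example_targets = torch.tensor([[1.0, 0.0, 1.0, 0.0, 1.0, 0.0]])
print(loss_fn(example_logits, example_targets))       # scalar loss
print((torch.sigmoid(example_logits) > 0.5).float())  # thresholded predictions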

def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path):

    valid_loss_min = np.inf

    for epoch in range(1, n_epochs + 1):
        train_loss = 0
        valid_loss = 0

        model.train()
        print('Epoch {}: START Training'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            ids = data['input_ids'].to(device, dtype=torch.long)
            mask = data['attention_mask'].to(device, dtype=torch.long)

            outputs = model(ids, mask)
            outputs = outputs[:, 0, :]  # keep only the [CLS] token's logits
            targets = data['targets'].to(device, dtype=torch.float)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Incremental running mean of the batch losses
            train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))

        print('Epoch {}: END Training'.format(epoch))

        print('Epoch {}: START Validation'.format(epoch))
        model.eval()

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader, 0):
                ids = data['input_ids'].to(device, dtype=torch.long)
                mask = data['attention_mask'].to(device, dtype=torch.long)

                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask)
                outputs = outputs[:, 0, :]
                loss = loss_fn(outputs, targets)

                valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))

        print('Epoch {}: END Validation'.format(epoch))

        # The running means above are already per-batch averages
        print('Epoch: {} \tAverage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        # Create a checkpoint with everything needed to resume training
        checkpoint = {
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }
        save_ckp(checkpoint, False, checkpoint_path, best_model_path)

        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min, valid_loss))
            # Save this checkpoint as the best model so far
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss

        print('Epoch {} Done\n'.format(epoch))

    return model

def load_ckp(checkpoint_fpath, model, optimizer):
    """
    checkpoint_fpath: path of the checkpoint to load
    model: model that we want to load checkpoint parameters into
    optimizer: optimizer we defined in previous training
    """
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    # valid_loss_min was saved as a plain Python float
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min


def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """
    state: checkpoint we want to save
    is_best: is this the best checkpoint (minimum validation loss)?
    checkpoint_path: path to save the checkpoint
    best_model_path: path to save the best model
    """
    torch.save(state, checkpoint_path)
    if is_best:
        shutil.copyfile(checkpoint_path, best_model_path)

ckpt_path = "model.pt"
best_model_path = "best_model.pt"

trained_model = train_model(EPOCHS,
                            train_data_loader,
                            val_data_loader,
                            model,
                            optimizer,
                            ckpt_path,
                            best_model_path)
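
# A usage sketch (an assumption, not part of the original file): reload the
# best checkpoint written by save_ckp and switch to eval mode for inference.
model, optimizer, start_epoch, valid_loss_min = load_ckp(best_model_path, model, optimizer)
model.eval()
print('Resumed from epoch {}, best validation loss {:.6f}'.format(start_epoch, valid_loss_min))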