thotran committed on
Commit
65931d3
1 Parent(s): a8995a8

requirements fix

Files changed (1)
  1. app.py +206 -0
app.py CHANGED
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.metrics import roc_auc_score
import re
from tqdm import tqdm  # plain tqdm: tqdm.notebook needs a Jupyter frontend, which Streamlit lacks
from typing import *
import string
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer
from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification
import streamlit as st

st.write("Please be patient, model training takes 20+ mins :P")

# config constants
SEED = 42
EPOCHS = 2
SEQ_SIZE = 150
BATCH_SIZE = 32
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

# import all data
# note: error_bad_lines was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines='skip' is the equivalent replacement
data = pd.read_csv('./data/train.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
test = pd.read_csv('./data/test.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
test_labels = pd.read_csv('./data/test_labels.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
sub = pd.read_csv('./data/sample_submission.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

# setup data
data.drop(columns='id', inplace=True)
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
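
# each comment carries six independent binary labels (a multi-hot target row,
# e.g. [1, 0, 0, 0, 1, 0] for a comment that is both toxic and insulting);
# a quick class-balance check:
print(data[labels].sum())
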
# text processing
def cleanString(comment: str) -> str:
    # expand common contractions (crude: "it's" always becomes "it is")
    comment = re.sub(r"n't", ' not', comment)
    comment = re.sub(r"'m", ' am', comment)
    comment = re.sub(r"'ve", ' have', comment)
    comment = re.sub(r"'s", ' is', comment)
    # pad newlines with spaces so they are collapsed cleanly below
    comment = comment.replace('\n', ' \n ')
    # str.replace does not take regexes, so the next two substitutions must use
    # re.sub: collapse runs of 3+ repeated punctuation to three, then drop digits
    comment = re.sub(r"([*!?'])\1{2,}", r'\1\1\1', comment)
    comment = re.sub(r'[0-9]', '', comment)
    comment = re.sub('[^a-zA-Z%]', ' ', comment)
    comment = re.sub('%', '', comment)
    comment = re.sub(r' +', ' ', comment)
    comment = re.sub(r'\n', ' ', comment)
    comment = re.sub(r' +', ' ', comment)
    comment = comment.strip()
    return comment

data.comment_text = data.comment_text.map(cleanString)
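
# quick sanity check of cleanString (illustrative input, not from the dataset):
# it shows both the repeated-punctuation collapse and the crude contraction expansion
assert cleanString("Don't do that!!!!!") == "Do not do that"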

# tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# measure tokenized lengths (truncation=True silences the length warning and caps
# encodings at 512, DistilBERT's maximum)
token_lens = []
for txt in tqdm(data.comment_text):
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))
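
# token_lens is otherwise unused; one optional use is confirming that SEQ_SIZE
# covers most comments before everything is truncated to it below
print(f'95th percentile of token lengths: {np.percentile(token_lens, 95):.0f} (SEQ_SIZE={SEQ_SIZE})')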

# train/val/test split (85% train, 7.5% val, 7.5% test)
df_train, df_test = train_test_split(data, test_size=0.15, random_state=SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)

# pytorch dataset
class CommentDataset(Dataset):
    def __init__(self, comments, targets, tokenizer, max_len):
        assert len(comments) == len(targets)
        self.comments = comments
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        comment = str(self.comments[item])
        target = self.targets[item]

        # pad_to_max_length was deprecated; padding='max_length' plus an explicit
        # truncation flag is the current equivalent
        encoding = self.tokenizer.encode_plus(comment,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              padding='max_length',
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_tensors='pt')
        return {'review_text': comment,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(target, dtype=torch.long)}

def create_data_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size: int):
    ds = CommentDataset(comments=df.comment_text.to_numpy(),
                        targets=df[labels].to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len)
    # note: no shuffling; batches follow dataset order
    return DataLoader(ds, batch_size=batch_size)
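
# sanity check (illustrative): one mini-batch should yield [batch, SEQ_SIZE] token
# tensors and [batch, num_labels] multi-hot targets
_sample = next(iter(create_data_loader(df_train, tokenizer, SEQ_SIZE, 2)))
assert _sample['input_ids'].shape == (2, SEQ_SIZE)
assert _sample['targets'].shape == (2, len(labels))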

# helper function to set seed
def set_seed(seed):
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

set_seed(SEED)

# gpu usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

config = DistilBertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
config.num_labels = len(labels)
config.problem_type = "multi_label_classification"  # model then uses BCEWithLogitsLoss
config.seq_classif_dropout = 0.2  # DistilBERT's name for the classifier-head dropout
config.return_dict = True

# from_pretrained loads the pretrained weights for fine-tuning; calling
# DistilBertForSequenceClassification(config) directly would train from scratch
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
model = model.to(device)

train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)

def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """
    hf = huggingface.
    """
    model.train()

    for batch in tqdm(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)

        optimizer.zero_grad()

        # passing labels makes the model compute the multi-label BCE loss internally
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
    model.eval()
    losses = []
    score = None

    for idx, batch in enumerate(tqdm(data_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)
        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
        # accumulate raw logits; callers apply a sigmoid when they need probabilities
        if idx == 0:
            score = outputs.logits.cpu()
        else:
            score = torch.cat((score, outputs.logits.cpu()))
        losses.append(outputs.loss.item())
    return score, np.mean(losses)

# transformers' AdamW is deprecated; torch.optim.AdamW is the drop-in replacement
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
best_val_loss = float('inf')
print('====START TRAINING====')
# training loop
for epoch in tqdm(range(EPOCHS)):
    print('-' * 10)
    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
    _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
    val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
    y_pred_np = val_pred.numpy()
    # roc_auc_score is rank-based, so raw logits work as scores without a sigmoid
    val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        #torch.save(model.state_dict(), 'distill_bert.pt')
    print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')

# once the model has been saved, there is no need to re-run training :)
# model = DistilBertForSequenceClassification(config)
# model.load_state_dict(torch.load('./distill_bert.pt'))
# model = model.to(device)

# test the model here
test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
print('====TEST RESULT====')
print(f'Log loss: {test_loss:.5}')
y_pred_np = test_pred.numpy()
test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
print(f'ROC AUC: {test_auc:.5}')

test_src_id = test.iloc[:, 0]  # keep the ids before the column is dropped
test.drop(columns='id', inplace=True)
test_labels.drop(columns='id', inplace=True)
test_src = pd.concat((test, test_labels), axis=1)

test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
# sigmoid turns the raw logits into independent per-label probabilities
prediction = torch.sigmoid(prediction).numpy()

sub[labels] = prediction
# show the scored comments next to their predictions; these must be the *test*
# comments (data.comment_text is the training set and has a different length)
sub.insert(1, "comment_text", test.comment_text.to_numpy(), True)
st.dataframe(sub)
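
# note: Streamlit re-runs this whole script (and so retrains the model) on every
# user interaction. A sketch of how training could run once per session instead,
# assuming a Streamlit version with st.cache_resource (>= 1.18):
#
# @st.cache_resource
# def get_trained_model():
#     ...  # run the training loop above and return the fine-tuned model
#
# model = get_trained_model()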