thotran committed • 65931d3
1 Parent(s): a8995a8

requirements fix
app.py CHANGED
@@ -0,0 +1,206 @@
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.metrics import roc_auc_score
import re
from tqdm import tqdm  # tqdm.notebook only renders inside Jupyter; plain tqdm works in a Streamlit app
from typing import *
import string
from sklearn.model_selection import train_test_split
from torch.optim import AdamW  # transformers.AdamW is deprecated; the torch implementation is the drop-in replacement
from transformers import DistilBertTokenizer
from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification
import streamlit as st
st.write("Please be patient, model training takes 20+ minutes :P")
# config constants
SEED = 42
EPOCHS = 2
SEQ_SIZE = 150
BATCH_SIZE = 32
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

# import all data; error_bad_lines was removed in pandas 2.0, on_bad_lines='skip' is its replacement
data = pd.read_csv('./data/train.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
test = pd.read_csv('./data/test.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
test_labels = pd.read_csv('./data/test_labels.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
sub = pd.read_csv('./data/sample_submission.csv', engine='python', encoding='utf-8', on_bad_lines='skip')

# set up data
data.drop(columns='id', inplace=True)
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# text processing
def cleanString(comment: str) -> str:
    # expand contractions
    comment = re.sub('n\'t', ' not', comment)
    comment = re.sub('\'m', ' am', comment)
    comment = re.sub('\'ve', ' have', comment)
    comment = re.sub('\'s', ' is', comment)
    # isolate newlines
    comment = comment.replace('\n', ' \n ')
    # str.replace is literal; re.sub is needed for these regex patterns to match anything
    comment = re.sub(r'([*!?\'])\1\1{2,}', r'\1\1\1', comment)
    comment = re.sub(r'[0-9]', '', comment)
    comment = re.sub('[^a-zA-Z%]', ' ', comment)
    comment = re.sub('%', '', comment)
    comment = re.sub(r' +', ' ', comment)
    comment = re.sub(r'\n', ' ', comment)
    comment = re.sub(r' +', ' ', comment)
    comment = comment.strip()
    return comment

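# illustrative example (reviewer addition, not part of the original commit) of
# what cleanString produces: digits, runs of repeated punctuation, and
# non-alphabetic characters are stripped, and contractions are expanded:
#   cleanString("You don't know!!!!! 100%\n")  ->  "You do not know"
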
data.comment_text = data.comment_text.map(cleanString)

# tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

token_lens = []

for txt in tqdm(data.comment_text):
    # truncation=True makes the 512-token cap explicit instead of emitting warnings
    tokens = tokenizer.encode(txt, max_length=512, truncation=True)
    token_lens.append(len(tokens))

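# illustrative addition (not in the original commit): token_lens is collected
# above but never inspected; a quick summary like this shows whether
# SEQ_SIZE = 150 covers most comments before truncation kicks in
st.write(f"token length p95: {np.percentile(token_lens, 95):.0f}, max: {max(token_lens)}")
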
# train/validation/test split (85/7.5/7.5)
df_train, df_test = train_test_split(data, test_size=0.15, random_state=SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)
# PyTorch dataset
class CommentDataset(Dataset):
    def __init__(self, comments, targets, tokenizer, max_len):
        assert len(comments) == len(targets)
        self.comments = comments
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        comment = str(self.comments[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(comment,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              padding='max_length',  # pad_to_max_length is deprecated
                                              truncation=True,
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                              )
        return {'review_text': comment,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(target, dtype=torch.long)}

def create_data_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size: int):
    ds = CommentDataset(comments=df.comment_text.to_numpy(),
                        targets=df[labels].to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len)

    return DataLoader(ds, batch_size=batch_size)

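# reviewer note (assumption, not in the original commit): create_data_loader
# never shuffles, so training batches arrive in dataset order every epoch;
# shuffle=True on the training split's DataLoader is the usual choice, e.g.
#   DataLoader(ds, batch_size=batch_size, shuffle=True)
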
# helper function to set seeds
def set_seed(seed):
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

set_seed(SEED)

# GPU usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

config = DistilBertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
config.num_labels = len(labels)
config.problem_type = "multi_label_classification"
config.classifier_dropout = 0.2
config.return_dict = True

# DistilBertForSequenceClassification(config) alone would train from randomly
# initialized weights; from_pretrained loads the pretrained encoder instead
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
model = model.to(device)

train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)

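# note (assumption, not in the original commit): batch_size=1 makes the val/test
# passes very slow; evaluate_for_hf concatenates logits across batches, so a
# larger batch size such as BATCH_SIZE would give identical scores, faster
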
def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """
    hf = huggingface.
    """
    model.train()

    for batch in tqdm(data_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)

        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            # with problem_type="multi_label_classification", the HF model
            # computes BCE-with-logits loss against the float targets itself
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
    model.eval()
    losses = []
    score = None

    for idx, batch in enumerate(tqdm(data_loader)):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)
        with torch.set_grad_enabled(False):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            if idx == 0:
                score = outputs.logits.cpu()
            else:
                score = torch.cat((score, outputs.logits.cpu()))
            losses.append(outputs.loss.item())
    return score, np.mean(losses)

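# note: evaluate_for_hf returns raw logits; roc_auc_score only needs a ranking,
# and sigmoid is monotonic, so AUC computed on logits below matches AUC on
# probabilities; sigmoid is applied later only where probabilities are reported
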
optimizer = AdamW(model.parameters(), lr=2e-5)
best_val_loss = 9999.
print('====START TRAINING====')
# training loop
for epoch in tqdm(range(EPOCHS)):
    print('-' * 10)
    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
    _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
    val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
    y_pred_np = val_pred.numpy()
    val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        #torch.save(model.state_dict(), 'distill_bert.pt')
    print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')
# once the model is saved and generated there is no need to re-run :)
#model = DistilBertForSequenceClassification(config)
#model.load_state_dict(torch.load('./distill_bert.pt'))
#model = model.to(device)
# test model here
test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
print('====TEST RESULT====')
print(f'Log loss: {test_loss:.5}')
y_pred_np = test_pred.numpy()
test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
print(f'ROC AUC: {test_auc:.5}')

test_src_id = test.iloc[:, 0]
test.drop(columns='id', inplace=True)
test_labels.drop(columns='id', inplace=True)
test_src = pd.concat((test, test_labels), axis=1)

test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
prediction = torch.sigmoid(prediction).numpy()  # logits -> per-label probabilities

sub[labels] = prediction
# the submission rows correspond to the test set, so the displayed text must be
# test_src.comment_text (data.comment_text holds the training split)
sub.insert(1, "tweet", test_src.comment_text, True)
st.dataframe(sub)
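
# illustrative follow-up (not in the original commit): turning the per-label
# probabilities into binary toxicity flags with a simple 0.5 cutoff
flagged = (sub[labels] > 0.5).astype(int)
st.write("comments flagged per label:")
st.write(flagged.sum())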