import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset
from sklearn.metrics import roc_auc_score
import re
from stqdm import stqdm
from typing import *
import string
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer  #AdamW now comes from torch.optim (transformers.AdamW is deprecated)
from transformers import DistilBertModel, DistilBertConfig, DistilBertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import streamlit as st

st.markdown("### Welcome to toxicity! A showcase for the TweetBert Model!")
#config constants
SEED = 42          # random seed for reproducible splits and weights
EPOCHS = 2         # fine-tuning epochs
SEQ_SIZE = 150     # max token length per comment
BATCH_SIZE = 32    # training batch size
PRE_TRAINED_MODEL_NAME = "distilbert-base-uncased"

#import all data
data=pd.read_csv('./data/train.csv',engine='python',encoding='utf-8')
test=pd.read_csv('./data/test.csv',engine='python',encoding='utf-8')
test_labels=pd.read_csv('./data/test_labels.csv',engine='python',encoding='utf-8')
sub=pd.read_csv('./data/sample_submission.csv',engine='python',encoding='utf-8')

#setup data
data.drop(columns='id',inplace=True)
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
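#each comment can carry any subset of these six labels, i.e. multi-label classification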

#text processing
def cleanString(comment: str) -> str:
    #expand common contractions
    comment = re.sub(r"n't", ' not', comment)
    comment = re.sub(r"'m", ' am', comment)
    comment = re.sub(r"'ve", ' have', comment)
    comment = re.sub(r"'s", ' is', comment)
    #pad newlines so they become separate whitespace
    comment = comment.replace('\n', ' \n ')
    #collapse runs of repeated punctuation (e.g. "!!!!!" -> "!!!"); str.replace does not
    #understand regex patterns, so these need re.sub
    comment = re.sub(r"([*!?'])\1{2,}", r'\1\1\1', comment)
    #drop digits, then everything that is not a letter
    comment = re.sub(r'[0-9]', '', comment)
    comment = re.sub('[^a-zA-Z%]', ' ', comment)
    comment = re.sub('%', '', comment)
    #squeeze whitespace and trim
    comment = re.sub(r' +', ' ', comment)
    comment = re.sub(r'\n', ' ', comment)
    comment = re.sub(r' +', ' ', comment)
    comment = comment.strip()
    return comment
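
#illustrative example of the cleaning above:
#  cleanString("Don't shout!!!!!\n 123")  ->  "Do not shout"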

data.comment_text = data.comment_text.map(cleanString)

#tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
token_lens = []

#for txt in stqdm(data.comment_text,desc="tokenizing"):
#  tokens = tokenizer.encode(txt, max_length=512)
#  token_lens.append(len(tokens))
#^commented out to keep the HF Space runtime short; it measures tokenized lengths of comment_text (useful for picking SEQ_SIZE)
#train/validation/test split
df_train, df_test = train_test_split(data, test_size=0.15, random_state=SEED)
df_val, df_test = train_test_split(df_test, test_size=0.5, random_state=SEED)
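#resulting split: ~85% train / ~7.5% validation / ~7.5% test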
#set pytorch dataset
class CommentDataset(Dataset):
    def __init__(self, comments, targets, tokenizer, max_len):
        assert len(comments) == len(targets)
        self.comments = comments
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, item):
        comment = str(self.comments[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(comment,
                                              add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              padding='max_length',   #pad_to_max_length is deprecated
                                              truncation=True,        #truncate comments longer than max_len
                                              return_attention_mask=True,
                                              return_tensors='pt',
                                             )
        return {'review_text': comment,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(target, dtype=torch.long)}

def create_data_loader(df: pd.DataFrame, tokenizer, max_len: int, batch_size: int):
    ds = CommentDataset(comments=df.comment_text.to_numpy(),
                        targets=df[labels].to_numpy(),
                        tokenizer=tokenizer,
                        max_len=max_len)

    return DataLoader(ds, batch_size=batch_size)

#helper function to set seed
def set_seed(seed):
    torch.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)

set_seed(SEED)

#gpu usage
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

config = DistilBertConfig.from_pretrained(PRE_TRAINED_MODEL_NAME)
config.num_labels = len(labels)
config.problem_type = "multi_label_classification"
config.classifier_dropout = 0.2
config.return_dict = True
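#with problem_type="multi_label_classification", the HF head applies BCEWithLogitsLoss internally when float labels are passed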

#from_pretrained loads the pretrained DistilBERT weights for fine-tuning;
#constructing DistilBertForSequenceClassification(config) directly would start from random weights
model = DistilBertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, config=config)
model = model.to(device)

train_dataloader = create_data_loader(df=df_train, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=BATCH_SIZE)
val_dataloader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
test_dataloader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
# training function
def train_epoch_for_hf(model, data_loader: DataLoader, device: torch.device, optimizer):
    """
    hf = huggingface.
    """
    model.train()

    for batch in stqdm(data_loader, desc="training"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)
        
        optimizer.zero_grad()

        with torch.set_grad_enabled(True):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
#evaluate the model: returns stacked logits and the mean loss (the best checkpoint is kept in the training loop below)
def evaluate_for_hf(model, data_loader: DataLoader, device: torch.device):
    model.eval()
    losses = []
    score = None

    for idx, batch in enumerate(stqdm(data_loader,desc="evaluating")):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        targets = batch["targets"].float().to(device)
        with torch.set_grad_enabled(False):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=targets)
            if idx == 0:
                score =  outputs.logits.cpu()
            else:
                score = torch.cat((score, outputs.logits.cpu()))
            losses.append(outputs.loss.item())
    return score, np.mean(losses)
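#note: evaluate_for_hf returns raw logits; torch.sigmoid turns them into per-label probabilities (applied below for predictions)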

optimizer = optim.AdamW(model.parameters(), lr=2e-5)  #torch's AdamW (transformers.AdamW is deprecated)
best_val_loss = float('inf')
print('====START TRAINING====')
#actual training loop (commented out below)
#for epoch in stqdm(range(EPOCHS)):
#    print('-' * 10)
#    train_epoch_for_hf(model=model, data_loader=train_dataloader, optimizer=optimizer, device=device)
#    _, tr_loss = evaluate_for_hf(model=model, data_loader=train_dataloader, device=device)
#    val_pred, val_loss = evaluate_for_hf(model=model, data_loader=val_dataloader, device=device)
#    y_pred_np = val_pred.numpy()
#    val_auc = roc_auc_score(df_val[labels].to_numpy(), y_pred_np)
#    if val_loss < best_val_loss:
#        best_val_loss = val_loss
#        torch.save(model.state_dict(), 'distill_bert.pt')
#    print(f'Epoch {epoch + 1}/{EPOCHS}', f'train loss: {tr_loss:.4},', f'val loss: {val_loss:.4},', f'val auc: {val_auc:.4}')
#^training is commented out: the fine-tuned model is already saved, so there is no need to re-run it here

#PUSH MODEL TO HF
#from huggingface_hub import notebook_login
#notebook_login()
#model.push_to_hub("tweetbert")
#tokenizer.push_to_hub("tweetbert")

#LOAD MODEL
model = AutoModelForSequenceClassification.from_pretrained("thotranexe/tweetbert")
model = model.to(device)

#TEST MODEL
#test_pred, test_loss = evaluate_for_hf(model=model, data_loader=test_dataloader, device=device)
#print('====TEST RESULT====')
#print(f'Log loss: {test_loss:.5}')
#y_pred_np = test_pred.numpy()
#test_auc = roc_auc_score(df_test[labels].to_numpy(), y_pred_np)
#print(f'ROC AUC: {test_auc:.5}')

#test_src_id = test.iloc[:, 0]
#test.drop(columns='id', inplace=True)
#test_labels.drop(columns='id', inplace=True)
#test_src = pd.concat((test, test_labels), axis=1)

#MAKE PREDICTIONS
#test_src_dataloader = create_data_loader(df=test_src, tokenizer=tokenizer, max_len=SEQ_SIZE, batch_size=1)
#prediction, _ = evaluate_for_hf(model=model, data_loader=test_src_dataloader, device=device)
#prediction = torch.sigmoid(prediction).numpy()

#SAVE RESULTS INTO SUBMISSION DATAFRAME
#sub[labels] = prediction
#sub.insert(1,"tweet",data.comment_text,True)
#sub.to_csv("sub.csv", encoding='utf-8', index=False)
#^the block above is commented out; predictions were saved to a CSV to cut wait/compute time on HF
sub = pd.read_csv('./data/sub.csv', engine='python', encoding='utf-8')
sub = sub.drop(columns="id")  #drop() returns a new frame, so reassign (the original call discarded the result)
st.dataframe(sub)
st.write("Here is a table of the tweets and the predicted likelihood of each label :) loaded from a pre-computed csv out of respect for your time")