import streamlit as st
import re
import torch
from transformers import AlbertTokenizer, AlbertModel
import pytorch_lightning as pl
from huggingface_hub import hf_hub_download


def download_torch_model():
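    """Download the fine-tuned classifier checkpoint from the Hugging Face Hub."""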
    model_path = hf_hub_download(repo_id="adrianmoses/hate-speech-detection", filename="pytorch_hs_model.net")
    print(model_path)
    return model_path

def load_model():
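    """Load the pretrained albert-base-v2 encoder (the fine-tuned weights are applied in setup_model)."""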
    model = AlbertModel.from_pretrained("albert-base-v2")
    return model

def load_tokenizer():
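    """Load the tokenizer that matches the albert-base-v2 encoder."""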
    tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    return tokenizer

def clean_tweet(tweet):
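    """Strip @mentions (optionally followed by a colon) from the tweet."""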
    return re.sub(r'@\w+:?', "", tweet, flags=re.IGNORECASE)


def tokenize(tweet):
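    """Clean the tweet and encode it as PyTorch tensors, truncated to at most 64 tokens."""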
    tweet = clean_tweet(tweet) 
    tokenizer = load_tokenizer()
    return tokenizer(tweet, padding=True, truncation=True, max_length=64, return_tensors='pt')



class HateSpeechClassifier(pl.LightningModule):
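    """ALBERT [CLS] embedding -> linear -> ReLU -> dropout -> linear -> log-softmax over two classes."""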

    def __init__(self, albert_model, dropout, hidden_dim, output_dim):
        super().__init__()
        self.model = albert_model
        self.l1 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.l2 = torch.nn.Linear(hidden_dim, output_dim)
        # NLLLoss pairs with the log_softmax output returned by forward().
        self.loss = torch.nn.NLLLoss()

    def forward(self, input_ids, attention_mask, token_type_ids):
        # Last hidden states from ALBERT; keep only the [CLS] token embedding.
        x = self.model(input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)[0]
        x = x[:, 0]
        x = self.dropout(torch.relu(self.l1(x)))
        # Return log-probabilities over the two classes.
        return torch.log_softmax(self.l2(x), dim=1)


    def training_step(self, batch, batch_idx):
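        # Each batch is (input_ids, attention_mask, token_type_ids, labels).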
        input_ids, attention_masks, token_type_ids, y = batch
        y_hat = self(input_ids, attention_masks, token_type_ids)
        loss = self.loss(y_hat, y.view(-1))
        return loss


    def validation_step(self, batch, batch_idx):
        input_ids, attention_masks, token_type_ids, y = batch
        y_hat = self(input_ids, attention_masks, token_type_ids)
        loss = self.loss(y_hat, y.view(-1))
        return loss


    def configure_optimizers(self):
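        # Adam with a small learning rate, typical for fine-tuning a pretrained transformer.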
        return torch.optim.Adam(self.parameters(), lr=1e-5)

def setup_model():
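    """Build the classifier around ALBERT and load the fine-tuned checkpoint on CPU."""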
    torch_model_path = download_torch_model()
    albert_model = load_model()
    model = HateSpeechClassifier(albert_model, dropout=0.5, hidden_dim=768, output_dim=2)
    model.load_state_dict(torch.load(torch_model_path, map_location=torch.device('cpu')))
    model.eval()
    return model


# Streamlit re-runs this script on every interaction: load the model, build the page, classify the input.
model = setup_model()

st.title("Hate Speech Detection")
st.caption("Text will be truncated to 64 tokens")

text = st.text_input("Enter text")

encoded_input = tokenize(text)

# Run a single forward pass with gradients disabled, on GPU if one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
input_ids = encoded_input['input_ids'].to(device)
attention_mask = encoded_input['attention_mask'].to(device)
token_type_ids = encoded_input['token_type_ids'].to(device)

with torch.no_grad():
    pred = model(input_ids, attention_mask, token_type_ids)

# Index of the highest log-probability: 1 means hate speech, 0 means not.
label = pred.argmax(dim=1).item()

is_hate_speech = "YES" if label == 1 else "NO"

st.write(f"Is this hate speech?: {is_hate_speech}")