import streamlit as st
import re
import torch
from transformers import AlbertTokenizer, AlbertModel
import pytorch_lightning as pl
from huggingface_hub import hf_hub_download
def download_torch_model():
    """Fetch the classifier checkpoint from the Hugging Face Hub.

    Returns:
        Whatever ``hf_hub_download`` returns for the requested file
        (used by the caller as the path handed to ``torch.load``).
    """
    # NOTE(review): ``filename`` is an empty string here; presumably the
    # checkpoint's file name inside the repo is missing -- confirm against
    # the Hub repository before relying on this download.
    return hf_hub_download(
        repo_id="adrianmoses/hate-speech-detection",
        filename="",
    )
def load_model():
    """Return the pretrained ``albert-base-v2`` encoder.

    The weights are resolved through ``AlbertModel.from_pretrained``, so the
    first call may need to download them.
    """
    return AlbertModel.from_pretrained("albert-base-v2")
def load_tokenizer():
    """Return the tokenizer that matches the ``albert-base-v2`` encoder.

    The vocabulary is resolved through ``AlbertTokenizer.from_pretrained``,
    so the first call may need to download it.
    """
    return AlbertTokenizer.from_pretrained("albert-base-v2")
# An "@" followed by one or more word characters and an optional trailing colon.
_MENTION_PATTERN = re.compile(r'@\w+:?', flags=re.IGNORECASE)


def clean_tweet(tweet):
    """Return *tweet* with every ``@mention`` (and a trailing colon) removed.

    Only the mention itself is deleted; surrounding whitespace is left
    untouched, so ``"@user: hello"`` becomes ``" hello"``.
    """
    return _MENTION_PATTERN.sub("", tweet)
def tokenize(tweet):
    """Strip mentions from *tweet* and encode it for the ALBERT model.

    Args:
        tweet: Raw text to classify.

    Returns:
        The tokenizer's encoding as PyTorch tensors, truncated to at most
        64 tokens. The calling script reads ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` from it.
    """
    cleaned = clean_tweet(tweet)
    encode = load_tokenizer()
    return encode(
        cleaned,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors='pt',
    )
class HateSpeechClassifier(pl.LightningModule):
    """Encoder plus a two-layer classification head.

    The head is ``Linear -> ReLU -> Dropout -> Linear -> log_softmax`` applied
    to the encoder output at the first token position. Training minimises
    the negative log-likelihood of the target class.

    Args:
        albert_model: Encoder module called with ``input_ids``,
            ``attention_mask`` and ``token_type_ids``.
        dropout: Dropout probability used inside the head.
        hidden_dim: Width of the encoder output and of the head's hidden
            layer.
        output_dim: Number of classes scored by the head.
    """

    def __init__(self, albert_model, dropout, hidden_dim, output_dim):
        # torch.nn.Module refuses to register sub-modules until its own
        # __init__ has run, so this call must come first.
        super().__init__()
        # Attribute names double as state-dict key prefixes; keep them stable
        # so previously saved checkpoints still load.
        self.model = albert_model
        self.l1 = torch.nn.Linear(hidden_dim, hidden_dim)
        self.dropout = torch.nn.Dropout(dropout)
        self.l2 = torch.nn.Linear(hidden_dim, output_dim)
        self.loss = torch.nn.NLLLoss()

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return per-class log-probabilities for each sequence in the batch."""
        # NOTE(review): assumes the encoder follows the Hugging Face forward
        # signature and returns the token-level hidden states as its first
        # output -- confirm against the code that trained the checkpoint.
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_states = outputs[0]
        # Use the representation at the first token position as the
        # summary of the whole sequence.
        first_token = hidden_states[:, 0]
        hidden = self.dropout(torch.relu(self.l1(first_token)))
        return torch.log_softmax(self.l2(hidden), dim=1)

    def _compute_loss(self, batch):
        """Run the forward pass on *batch* and return its NLL loss."""
        input_ids, attention_masks, token_type_ids, y = batch
        y_hat = self(input_ids, attention_masks, token_type_ids)
        # NLLLoss takes 1-D class indices, so flatten the targets.
        return self.loss(y_hat, y.view(-1))

    def training_step(self, batch, batch_idx):
        """Return the loss for one training batch."""
        return self._compute_loss(batch)

    def validation_step(self, batch, batch_idx):
        """Return the loss for one validation batch."""
        return self._compute_loss(batch)

    def configure_optimizers(self):
        """Optimise every parameter with Adam at a learning rate of 1e-5."""
        return torch.optim.Adam(self.parameters(), lr=1e-5)
def setup_model():
    """Build the classifier and load the downloaded weights onto the CPU.

    Returns:
        A ``HateSpeechClassifier`` (dropout 0.5, hidden size 768, two output
        classes) with the downloaded state dict applied.
    """
    checkpoint_path = download_torch_model()
    classifier = HateSpeechClassifier(
        albert_model=load_model(),
        dropout=0.5,
        hidden_dim=768,
        output_dim=2,
    )
    # NOTE(review): torch.load unpickles the file, so the checkpoint source
    # must be trusted.
    state_dict = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    classifier.load_state_dict(state_dict)
    return classifier
# Streamlit entry point: build the model, read one line of text, show the verdict.
model = setup_model()
# Inference only: disable dropout so the same text always gets the same answer.
model.eval()

st.title("Hate Speech Detection")
st.caption("Text will be truncated to 64 tokens")
text = st.text_input("Enter text")

# Nothing to classify until the user has typed something.
if text.strip():
    encoded_input = tokenize(text)
    # Gradients are not needed to make a prediction.
    with torch.no_grad():
        pred = model(
            encoded_input['input_ids'],
            encoded_input['attention_mask'],
            encoded_input['token_type_ids'],
        )
    # ``pred`` holds one row of per-class log-probabilities (a single text was
    # encoded); pick the index of the highest-scoring class.
    label = torch.argmax(pred, dim=1).item()
    # NOTE(review): class index 1 is treated as "hate speech", as in the
    # original comparison -- confirm against the training label mapping.
    is_hate_speech = "YES" if label == 1 else "NO"
    st.write(f"Is this hate speech?: {is_hate_speech}")