# Spaces status: Runtime error
import numpy as np
import pandas as pd
import streamlit as st
import torch
from sklearn import metrics
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification, BertModel
class ToxicityDataset(Dataset):
    """Map-style dataset that tokenizes comments for a multi-label classifier.

    Each item is a dict of tensors with the keys ``ids``, ``mask``,
    ``token_type_ids`` (taken from the tokenizer output) and ``targets``
    (the row's label vector converted to ``torch.float``).

    Args:
        dataframe: Table exposing a ``comment_text`` column and a ``labels``
            column. Each ``labels`` entry must be accepted by
            ``torch.tensor`` (presumably a list of 0/1 flags -- confirm
            against the caller).
        tokenizer: Object providing ``encode_plus`` whose result contains
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Length passed to the tokenizer as ``max_length``; padding
            and truncation to that length are requested from the tokenizer.
    """

    def __init__(self, dataframe, tokenizer, max_len: int):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = self.data.comment_text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self) -> int:
        """Return the number of comments in the dataset."""
        return len(self.text)

    def __getitem__(self, index: int) -> dict:
        """Tokenize the comment at position ``index`` and return its tensors."""
        # Samplers hand out positions 0..len-1. Label-based lookup
        # (``series[index]``) only agrees with that when the frame has a
        # default RangeIndex, so use positional access to stay correct for
        # filtered or re-indexed frames as well.
        raw_text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(raw_text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True,
        )

        return {
            "ids": torch.tensor(inputs["input_ids"], dtype=torch.long),
            "mask": torch.tensor(inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(
                inputs["token_type_ids"], dtype=torch.long
            ),
            "targets": torch.tensor(
                self.targets.iloc[index], dtype=torch.float
            ),
        }
def inference(net=None, loader=None):
    """Run a model over a loader and collect sigmoid probabilities and targets.

    Args:
        net: Model to evaluate. Defaults to the module-level ``model``.
        loader: Iterable of batches, each a dict holding ``ids``, ``mask``,
            ``token_type_ids`` and ``targets`` tensors. Defaults to the
            module-level ``testing_loader``.

    Returns:
        A ``(final_outputs, final_targets)`` tuple of nested Python lists:
        the element-wise sigmoid of the model outputs and the matching
        targets, in the order the loader yielded them.
    """
    net = model if net is None else net
    loader = testing_loader if loader is None else loader

    # Run on whatever device the model already lives on instead of relying
    # on a separate global; a parameter-free model falls back to the CPU.
    first_param = next(iter(net.parameters()), None)
    run_device = (
        first_param.device if first_param is not None else torch.device("cpu")
    )

    net.eval()
    final_targets = []
    final_outputs = []
    with torch.no_grad():
        for batch in loader:
            ids = batch["ids"].to(run_device, dtype=torch.long)
            mask = batch["mask"].to(run_device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(
                run_device, dtype=torch.long
            )
            targets = batch["targets"].to(run_device, dtype=torch.float)

            outputs = net(ids, mask, token_type_ids)
            # Some models return an output object carrying ``logits`` rather
            # than a bare tensor; accept both forms.
            logits = getattr(outputs, "logits", outputs)

            final_targets.extend(targets.cpu().tolist())
            final_outputs.extend(torch.sigmoid(logits).cpu().tolist())
    return final_outputs, final_targets
# ---------------------------------------------------------------------------
# Streamlit app body: load both models, build the evaluation set, run the
# selected model and report multi-label metrics.
# ---------------------------------------------------------------------------

# ``inference`` moves batches to ``device``; define it once here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert_path = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_path)
bert_model = BertForSequenceClassification.from_pretrained(bert_path, num_labels=6)
bert_model.to(device)

# NOTE(security): torch.load unpickles arbitrary objects -- only load a
# checkpoint that comes from a trusted source.
# NOTE(review): the rest of the script treats the loaded object as a full
# model (it is called and put in eval mode), not a state_dict -- confirm.
tuned_model = torch.load("pytorch_bert_toxic.bin", map_location=device)

tweets_raw = pd.read_csv("test.csv", nrows=20)
labels_raw = pd.read_csv("test_labels.csv", nrows=20)
label_set = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
MAX_LENGTH = 100
TEST_BATCH_SIZE = 128

# Combine comments and labels into the frame ToxicityDataset expects
# (``comment_text`` plus a list-valued ``labels`` column).
# NOTE(review): assumes both CSVs share an ``id`` column and that a negative
# label marks a row excluded from scoring -- confirm against the data files.
test_df = tweets_raw.merge(labels_raw, on="id")
test_df = test_df[(test_df[label_set] >= 0).all(axis=1)].reset_index(drop=True)
if test_df.empty:
    st.warning("No labelled rows available for evaluation.")
    st.stop()
test_df["labels"] = test_df[label_set].values.tolist()

# Pick the model before building the dataset: the dataset needs the tokenizer.
option = st.selectbox("Select a text analysis model:", ("BERT", "Fine-tuned BERT"))
tokenizer = bert_tokenizer  # both options share the same tokenizer
model = bert_model if option == "BERT" else tuned_model

test_dataset = ToxicityDataset(test_df, tokenizer, MAX_LENGTH)
# Shuffling has no effect on the aggregate metrics, so keep evaluation ordered.
test_params = {"batch_size": TEST_BATCH_SIZE, "shuffle": False, "num_workers": 0}
testing_loader = DataLoader(test_dataset, **test_params)

prediction, targets = inference()
probabilities = np.array(prediction)
targets = np.array(targets).astype(int)
# Score the multi-label indicator matrices directly. Collapsing them with
# argmax would count every row without a positive label as class 0 ("toxic").
prediction = (probabilities >= 0.5).astype(int)
accuracy = metrics.accuracy_score(targets, prediction)
f1_score_micro = metrics.f1_score(
    targets, prediction, average="micro", zero_division=0
)
f1_score_macro = metrics.f1_score(
    targets, prediction, average="macro", zero_division=0
)
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")

# Write results
# NOTE(review): the original referenced these two values without defining
# them. Here "toxic" is the mean, over comments, of the highest per-label
# probability and "neutral" is its complement -- confirm the intended meaning.
toxicProb = float(probabilities.max(axis=1).mean())
neutralProb = 1.0 - toxicProb
st.write("Classification Probabilities")
st.write(f"{neutralProb:.4f} - NEUTRAL")
st.write(f"{toxicProb:.4f} - TOXIC")