import gradio as gr
import torch.nn as nn
import torch
from transformers import BertTokenizerFast as BertTokenizer, BertModel
import pytorch_lightning as pl
BERT_MODEL_NAME = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
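# the label order must match the order used when the checkpoint was trained:
# the classifier head's outputs are mapped to these names positionally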
class ToxicCommentTagger(pl.LightningModule):
    def __init__(self, n_classes: int, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.bert = BertModel.from_pretrained(BERT_MODEL_NAME, return_dict=True)
        self.classifier = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCELoss()
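
    # NOTE: this forward pass is a reconstruction (the training-time Lightning
    # hooks are omitted here). predict() below unpacks the model call as
    # (loss, probabilities), so forward must return that pair; this sketch
    # assumes the checkpoint was trained with a sigmoid head and BCELoss,
    # matching self.criterion above.
    def forward(self, input_ids, attention_mask, labels=None):
        output = self.bert(input_ids, attention_mask=attention_mask)
        output = self.classifier(output.pooler_output)
        output = torch.sigmoid(output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output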
def predict(model, tokenizer, sentence):
    # encode without special tokens; long comments are allowed to exceed
    # max_length here and are split into BERT-sized chunks below
    encoding = tokenizer.encode_plus(
        sentence,
        add_special_tokens=False,
        max_length=510,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors='pt'
    )
    # target chunk size, including the [CLS] and [SEP] tokens added below
    chunksize = 512
    # split into chunks of 510 tokens; convert to list (split() returns an
    # immutable tuple)
    input_id_chunks = list(encoding['input_ids'][0].split(chunksize - 2))
    mask_chunks = list(encoding['attention_mask'][0].split(chunksize - 2))
    # loop through each chunk
    for i in range(len(input_id_chunks)):
        # prepend the [CLS] (101) and append the [SEP] (102) token IDs
        input_id_chunks[i] = torch.cat([
            torch.tensor([101]), input_id_chunks[i], torch.tensor([102])
        ])
        # extend the attention mask to cover the two added tokens
        mask_chunks[i] = torch.cat([
            torch.tensor([1]), mask_chunks[i], torch.tensor([1])
        ])
        # get required padding length
        pad_len = chunksize - input_id_chunks[i].shape[0]
        # pad the last chunk up to the full chunk size if needed
        # (torch.zeros with an explicit long dtype avoids mixing float and
        # integer tensors in torch.cat)
        if pad_len > 0:
            input_id_chunks[i] = torch.cat([
                input_id_chunks[i], torch.zeros(pad_len, dtype=torch.long)
            ])
            mask_chunks[i] = torch.cat([
                mask_chunks[i], torch.zeros(pad_len, dtype=torch.long)
            ])
    # stack the chunks into a batch of shape (n_chunks, 512)
    input_ids = torch.stack(input_id_chunks)
    attention_mask = torch.stack(mask_chunks)
    input_dict = {
        'input_ids': input_ids.long(),
        'attention_mask': attention_mask.int()
    }
    _, test_prediction = model(**input_dict)
    test_prediction = test_prediction.numpy()
    # keep the highest score per label across all chunks
    output = {}
    for chunk in test_prediction:
        for label, prediction in zip(LABEL_COLUMNS, chunk):
            if label in output:
                output[label] = max(prediction, output[label])
            else:
                output[label] = prediction
    return output
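
# Sanity check for the chunking above (hypothetical numbers): a comment that
# tokenizes to 1200 tokens splits into ceil(1200 / 510) = 3 chunks; each gets
# [CLS]/[SEP] plus padding to length 512, so input_ids has shape (3, 512) and
# the final score per label is the max over the 3 chunk predictions.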
model = ToxicCommentTagger.load_from_checkpoint(
    './best-checkpoint.ckpt',
    n_classes=len(LABEL_COLUMNS)
)
model.eval()
model.freeze()
# categories shown in the JSON output panel; kept in sync with the model's
# labels (the original listed a different label set than LABEL_COLUMNS, so
# the panel did not match what predict() actually returns)
all_categories = {'all_categories': LABEL_COLUMNS}
examples = [
    ['Yes indeed. She sort of reminds me of the elder lady that played the part in the movie "Titanic" who was telling her story!!! And I wouldn\'t have wanted to cover who I really am!! I would be proud!!!! WE should be proud of our race no matter what it is!!',
     50],
    ['The trans women reading this tweet right now is beautiful', 50],
    ["Question: These 4 broads who criticize America, what country did they flee to get here? And now they want to make OUR America like THEIR former HELL HOLE. I don't think so!!!!!!!!!! Let them explain their GRATITUDE for letting them in OUR country.",
     50],
    ['Well being subtle you could not have been born because white people don\'t matter',
     50],
    ["For starters bend over the one in pink and kick that ass and pussy to get a taste until she's begging for a dick inside her.",
     50]
]
def toxicity(sentence, threshold):
    predicts = predict(model, tokenizer, sentence)
    # the slider value is 0-100; e.g. threshold=50 keeps labels scoring above 0.5
    return [x for x in predicts if predicts[x] > threshold / 100], all_categories
gr.Interface(fn=toxicity,
             inputs=[
                 gr.Textbox(placeholder="Enter sentence here..."),
                 gr.Slider(0, 100)
             ],
             outputs=[
                 'text',
                 gr.JSON(all_categories)
             ],
             examples=examples).launch()