Spaces:

Thushalya
/

AiLERT

Sleeping

File size: 17,638 Bytes

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,AutoModel
import re
from textblob import TextBlob
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import emoji 
import string
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import textstat
import pandas as pd
from transformers import pipeline
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import os
from dotenv import load_dotenv
import pandas as pd
load_dotenv()

    





# Author-style (stylometric) feature extractors
def average_word_length(tweet):
    """Return the mean length of the whitespace-separated words in ``tweet``.

    Args:
        tweet: Raw tweet text.

    Returns:
        The average number of characters per word, or ``0.0`` when the tweet
        contains no words (empty or whitespace-only input).
    """
    words = tweet.split()
    # Guard against division by zero for empty / whitespace-only input.
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)


def lexical_diversity(tweet):
    """Return the ratio of distinct words to total words in ``tweet``.

    Words are whitespace-separated and compared case-sensitively.

    Args:
        tweet: Raw tweet text.

    Returns:
        A value in ``[0, 1]``; ``0.0`` when the tweet contains no words.
    """
    words = tweet.split()
    # Guard against division by zero for empty / whitespace-only input.
    if not words:
        return 0.0
    return len(set(words)) / len(words)

def count_capital_letters(tweet):
    """Return how many characters of ``tweet`` are uppercase."""
    # Each True from str.isupper contributes 1 to the total.
    return sum(map(str.isupper, tweet))

def count_words_surrounded_by_colons(tweet):
    """Count non-overlapping ``:word:`` tokens in ``tweet``.

    A token is one or more word characters enclosed by a colon on each side.
    A colon consumed as the closing delimiter of one match is not reused as
    the opening delimiter of the next.
    """
    return sum(1 for _ in re.finditer(r':(\w+):', tweet))

def count_emojis(tweet):
    """Count emojis in ``tweet`` via their ``:name:`` text form.

    The tweet is passed through ``emoji.demojize`` and the resulting
    ``:word:`` tokens are counted, so any literal ``:word:`` already present
    in the text is counted as well.
    """
    return count_words_surrounded_by_colons(emoji.demojize(tweet))

def hashtag_frequency(tweet):
    """Return the number of ``#word`` hashtags found in ``tweet``."""
    return sum(1 for _ in re.finditer(r'#\w+', tweet))

def mention_frequency(tweet):
    """Return the number of ``@word`` mentions found in ``tweet``."""
    return sum(1 for _ in re.finditer(r'@\w+', tweet))

def count_special_characters(tweet):
    """Return how many characters of ``tweet`` are ASCII punctuation.

    "Punctuation" is exactly the set in ``string.punctuation``.
    """
    total = 0
    for symbol in tweet:
        if symbol in string.punctuation:
            total += 1
    return total


def stop_word_frequency(tweet):
    """Return how many whitespace-separated words of ``tweet`` are English stop words.

    Words are lower-cased before being compared against NLTK's English
    stop-word list; attached punctuation is not stripped.
    """
    english_stops = set(stopwords.words('english'))
    return sum(1 for token in tweet.split() if token.lower() in english_stops)

# Fetch the NLTK resources used by the feature extractors in this file:
# the tokenizer model (word_tokenize), the POS tagger (pos_tag) and the
# stop-word corpus (stopwords.words). This runs at import time.
# NOTE(review): newer NLTK releases may expect differently named resources
# (e.g. 'punkt_tab') — confirm against the pinned NLTK version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def get_linguistic_features(tweet):
    """Count part-of-speech categories among the content words of ``tweet``.

    The tweet is tokenized, non-alphanumeric tokens and English stop words are
    dropped, the remaining tokens are lower-cased and POS-tagged, and the tags
    are tallied by prefix.

    Returns:
        A dict with the keys ``Noun_Count``, ``Verb_Count``,
        ``Participle_Count``, ``Interjection_Count``, ``Pronoun_Count``,
        ``Preposition_Count``, ``Adverb_Count`` and ``Conjunction_Count``,
        in that order.
    """
    tokens = word_tokenize(tweet)
    english_stops = set(stopwords.words('english'))

    # Keep only alphanumeric, non-stop-word tokens, lower-cased.
    content_tokens = []
    for token in tokens:
        lowered = token.lower()
        if token.isalnum() and lowered not in english_stops:
            content_tokens.append(lowered)

    tally = {
        'Noun_Count': 0,
        'Verb_Count': 0,
        'Participle_Count': 0,
        'Interjection_Count': 0,
        'Pronoun_Count': 0,
        'Preposition_Count': 0,
        'Adverb_Count': 0,
        'Conjunction_Count': 0,
    }

    # Single pass over the tagged tokens; the checks are independent because a
    # tag is matched purely by prefix.
    for token, tag in pos_tag(content_tokens):
        if tag.startswith('N'):
            tally['Noun_Count'] += 1
        if tag.startswith('V'):
            tally['Verb_Count'] += 1
            # Participles are approximated as verbs containing "ing" or "ed".
            if 'ing' in token or 'ed' in token:
                tally['Participle_Count'] += 1
        if tag == 'UH':
            tally['Interjection_Count'] += 1
        if tag.startswith('PRP'):
            tally['Pronoun_Count'] += 1
        if tag.startswith('IN'):
            tally['Preposition_Count'] += 1
        if tag.startswith('RB'):
            tally['Adverb_Count'] += 1
        if tag.startswith('CC'):
            tally['Conjunction_Count'] += 1

    return tally

def readability_score(tweet):
    """Return the Flesch Reading Ease score that ``textstat`` computes for ``tweet``."""
    ease = textstat.flesch_reading_ease(tweet)
    return ease

def get_url_frequency(tweet):
    """Return the number of ``http://`` / ``https://`` URLs found in ``tweet``."""
    url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    return sum(1 for _ in re.finditer(url_pattern, tweet))


# Define a function to extract features from a single tweet
def extract_features(tweet):
    """Compute every stylometric feature for a single tweet.

    Returns:
        A dict mapping feature name to value. Insertion order is significant:
        callers that flatten ``.values()`` into a vector rely on it.
    """
    features = {}
    features['Average_Word_Length'] = average_word_length(tweet)
    features['Lexical_Diversity'] = lexical_diversity(tweet)
    features['Capital_Letters_Count'] = count_capital_letters(tweet)
    features['Hashtag_Frequency'] = hashtag_frequency(tweet)
    features['Mention_Frequency'] = mention_frequency(tweet)
    features['count_emojis'] = count_emojis(tweet)
    features['special_chars_count'] = count_special_characters(tweet)
    features['Stop_Word_Frequency'] = stop_word_frequency(tweet)
    # Part-of-speech counts are spliced in at this position.
    features.update(get_linguistic_features(tweet))
    features['Readability_Score'] = readability_score(tweet)
    features['URL_Frequency'] = get_url_frequency(tweet)
    return features

# # Extract features for all tweets
# features_list = [extract_features(tweet) for tweet in X['text']]

# # Create a Pandas DataFrame
# X_new = pd.DataFrame(features_list)



# Loading personality model

def personality_detection(text, threshold=0.05, endpoint= 1.0):
    """Score ``text`` with the fine-tuned personality classifier.

    The tokenizer and model are loaded from the Hugging Face Hub on every
    call, authenticating with the ``PERSONALITY_TOKEN`` environment variable
    when it is set.

    Args:
        text: Input text to classify.
        threshold: Unused; kept for backward compatibility with callers.
        endpoint: Unused; kept for backward compatibility with callers.

    Returns:
        A list of five 0-d numpy arrays: the sigmoid of the first five logits
        for the single input. The rest of this file labels them
        Agreeableness, Conscientiousness, Extraversion, Neuroticism and
        Openness, in that order.
    """
    # The token is a secret: read it, but never print or log it.
    PERSONALITY_TOKEN = os.environ.get('PERSONALITY_TOKEN', None)
    model_id = "Nasserelsaman/microsoft-finetuned-personality"
    tokenizer = AutoTokenizer.from_pretrained(model_id, token=PERSONALITY_TOKEN)
    model = AutoModelForSequenceClassification.from_pretrained(model_id, token=PERSONALITY_TOKEN)

    with torch.no_grad():
        inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")
        # One forward pass is enough; the logits are squashed into (0, 1).
        logits = model(**inputs).logits
        probabilities = torch.sigmoid(logits)

    print(probabilities)
    return [probabilities[0][i].detach().numpy() for i in range(5)]


# tokenizer = AutoTokenizer.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")
# model = AutoModelForSequenceClassification.from_pretrained("Nasserelsaman/microsoft-finetuned-personality")

#Loading emotion model

# tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")
# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion-multilabel-latest")

##use this for gpu
# pipe = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True,device=device )

##use this for cpu
def calc_emotion_score(tweet):
    """Return the eleven emotion scores the multilabel classifier gives ``tweet``.

    A text-classification pipeline is built on each call. Every label/score
    entry for the tweet is printed, then the scores of the first eleven
    entries are returned in the order the pipeline produced them.
    """
    classifier = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-emotion-multilabel-latest", return_all_scores=True )
    label_scores = classifier(tweet)[0]
    for entry in label_scores:
        print(entry)

    return [label_scores[idx]['score'] for idx in range(11)]
    





#DCL model launching

def load_model(tweet):
    """Build the model inputs for ``tweet``, load the classifier and run it.

    Steps, in order:
      1. Tokenize the tweet with the ``vinai/bertweet-base`` tokenizer,
         padded/truncated to ``PADDING_MAX_LENGTH`` tokens.
      2. Build the auxiliary vector: emotion scores, then the values of
         ``extract_features`` in dict order, then the personality scores.
      3. Construct the network, load its weights from ``./model.pt`` and run a
         single forward pass.

    Returns:
        A tuple ``(prediction, emotion_scores, personality_scores)`` where
        ``prediction`` is the sigmoid of the model output (a tensor),
        ``emotion_scores`` is a copy of the raw emotion score list taken
        before it is extended, and ``personality_scores`` is the list returned
        by ``personality_detection``.
    """
    # model = torch.load("./authormodel.pt",map_location ='cpu') 
    # print(model)

    model_name = "vinai/bertweet-base"
    PADDING_MAX_LENGTH = 45
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(tweet, truncation=True, padding='max_length',max_length=PADDING_MAX_LENGTH,add_special_tokens=True, return_tensors="pt")
    print(inputs)
    emotion_list = calc_emotion_score(tweet)
    print(emotion_list)
    # Snapshot of the emotion scores alone; emotion_list is extended in place below.
    preemotion_list = emotion_list[:]

    # Append the stylometric features; their order follows the dict's insertion order.
    features_list = extract_features(tweet)
    for i in features_list.values():
        emotion_list.append(i)
    print("emotion + author",emotion_list)
    # print()
    # print(features_list)
    personality_list = personality_detection(tweet)
    print("personality",personality_list)
    # person_list = [personality_list["Extraversion"],personality_list['Neuroticism'],personality_list['Agreeableness'],personality_list['Conscientiousness'],personality_list['Openness']]
    emotion_list.extend(personality_list)
    print("final list",emotion_list)
    # print(str(features_list["Average_Word_Length"]))
    # Wrapped in an outer list so the tensor has a leading batch dimension of 1.
    inputs['emotion_author_vector'] =  torch.tensor([emotion_list])

    print("final inputs    ",inputs)
    
    
    # []
    # inputs["emotion_author_vector"] = 
    # train_dataloader=DataLoader(inputs, batch_size=1 , shuffle=False)
    # print(train_dataloader)
    device = torch.device("cuda:0"  if torch.cuda.is_available() else "cpu")
    # def tokenize_function(examples):
    #     return tokenizer.batch_encode_plus(examples["text"], padding='max_length',max_length=PADDING_MAX_LENGTH,add_special_tokens=True,truncation=True)
    class EmotionAuthorGuidedDCLModel(nn.Module):
        """Frozen encoder plus a linear head over [encoder output ; auxiliary vector]."""

        def __init__(self,dcl_model:nn.Module,dropout:float=0.5):
            super(EmotionAuthorGuidedDCLModel, self).__init__()
            self.dcl_model = dcl_model
            # 802 = 768 (encoder width used by DCLArchitecture below) + 34 auxiliary
            # values: 11 emotion scores + 18 stylometric features + 5 personality scores.
            self.dim = 802
            self.dropout = nn.Dropout(dropout)
            self.linear = nn.Linear(self.dim, 1)
            # Freeze all layers
            for param in self.dcl_model.parameters():
                param.requires_grad = False

        def forward(self,batch_tokenized):
            """Return one raw (pre-sigmoid) score per batch row."""
            input_ids = batch_tokenized['input_ids']
            attention_mask = batch_tokenized['attention_mask']
            emotion_vector = batch_tokenized['emotion_author_vector']
            bert_output = self.dcl_model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
            # Element 1 of the encoder output — presumably the pooled sentence
            # representation; TODO confirm against the encoder's output type.
            bert_cls_hidden_state = bert_output[1]
            combined_vector =torch.cat((bert_cls_hidden_state,emotion_vector), 1)
            d_combined_vector=self.dropout(combined_vector)
            linear_output = self.linear(d_combined_vector)
            pred_linear = linear_output.squeeze(1)
            return pred_linear
    # twee
   
    # NOTE(review): torch.load unpickles the file; only load checkpoints from a
    # trusted source. The weights are mapped onto the CPU regardless of `device`.
    checkpoint = {
        "model_state_dict":torch.load("./model.pt",map_location ='cpu') ,
    }
     
    # checkpoint=load_checkpoint(run=run_dcl_study,check_point_name="model_checkpoints/")
    
    class DCLArchitecture(nn.Module):
        """Pretrained encoder with a dropout + dense head.

        In this function only the ``bert`` attribute of an instance is used
        (it is handed to ``EmotionAuthorGuidedDCLModel``).
        """

        def __init__(self,dropout:float,bert_model_name:str='vinai/bertweet-base'):
            super(DCLArchitecture, self).__init__()
            self.bert = AutoModel.from_pretrained(bert_model_name)
            self.dim = 768
            self.dense = nn.Linear(self.dim, 1)
            self.dropout = nn.Dropout(dropout)

        def forward(self,batch_tokenized, if_train=False):
            """Return ``(hidden_state, linear_output)`` for the batch."""
            input_ids = batch_tokenized['input_ids']
            attention_mask = batch_tokenized['attention_mask']
            bert_output = self.bert(input_ids, attention_mask=attention_mask, output_hidden_states=True)
            bert_cls_hidden_state = bert_output[1]
            torch.cuda.empty_cache()

            if if_train:
                # Pair every row with a dropout-perturbed copy of itself, then
                # reshape back to rows of width self.dim.
                bert_cls_hidden_state_aug = self.dropout(bert_cls_hidden_state)
                bert_cls_hidden_state = torch.cat((bert_cls_hidden_state, bert_cls_hidden_state_aug), dim=1).reshape(-1, self.dim)
            else:
                bert_cls_hidden_state = self.dropout(bert_cls_hidden_state)

            linear_output = self.dense(bert_cls_hidden_state)
            linear_output = linear_output.squeeze(1)

            return bert_cls_hidden_state, linear_output
    

    # dcl_model = DCLArchitecture(bert_model_name=model_name,dropout=best_prams["DROPOUT"])
    dcl_model = DCLArchitecture(bert_model_name=model_name,dropout=0.5)
    dcl_model.to(device)
    
    DROPOUT = 0.5
    # Only the encoder is reused; its weights are then overwritten by the
    # state dict loaded from ./model.pt.
    fined_tuned_bert_model=dcl_model.bert
    model = EmotionAuthorGuidedDCLModel(dcl_model=fined_tuned_bert_model,dropout=DROPOUT)
    model.to(device)
    model.load_state_dict(checkpoint["model_state_dict"])
    



    # def test_loop(model, test_dataloader, device):
    # # collection_metric = MetricCollection(
    # #       BinaryAccuracy(),
    # #       MulticlassPrecision(num_classes=2,average=average),
    # #       MulticlassRecall(num_classes=2,average=average),
    # #       MulticlassF1Score(num_classes=2,average=average),
    # #       BinaryConfusionMatrix()
    # # )
    # # collection_metric.to(device)
    #     model.eval()
    #     print(test_dataloader)
    #     # total_test_loss = 0.0
    #     for batch in test_dataloader:
    #         print(batch)
    #         batch = {k: v.to(device) for k, v in batch.items()}
    #         # labels = batch["labels"]
    #         with torch.no_grad():
    #             pred = model(batch)
    #             # loss = criteon(pred, labels.float())
    #             pred = torch.round(torch.sigmoid(pred))
        
    #     return pred
    # result_metrics=test_loop(model=model, test_dataloader=train_dataloader,device=device)
    # print("Hate speech result",result_metrics)

    def predict_single_text(model, inputs,device):
        """Run ``model`` on ``inputs`` in eval mode and return the sigmoid of its output."""
        # Preprocess the text
        # inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        # Move every input tensor onto the same device as the model.
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        # Pass the preprocessed text through the model
        with torch.no_grad():
            # eval() disables dropout so the prediction is deterministic.
            model.eval()
            pred = model(inputs)
            print("prediction ",pred)
            print("sigmoid output",torch.sigmoid(pred))
            pred = torch.sigmoid(pred)
            # Assuming your model returns a single value for prediction
            
        
        return pred

    predicted_class = predict_single_text(model, inputs, device)
    return predicted_class,preemotion_list,personality_list
    # print("Hate speech result",predicted_class)




#Gradio interface
# Module-level placeholder; kept because other module-level code refers to it.
simple  = None
def greet(tweet):
    """Classify ``tweet`` and build the values shown by the Gradio interface.

    Args:
        tweet: The input sentence entered by the user.

    Returns:
        A 5-tuple ``(label, hate_pct, non_hate_pct, emotions, personality)``:
        ``label`` is ``"Hate"`` or ``"Non Hate"``; the two percentages are
        strings such as ``"87.25%"``; ``emotions`` and ``personality`` are
        DataFrames with a name column and a ``Values`` column holding plain
        floats on the 0-1 scale.
    """
    print("start")
    prediction,preemotion_list,personality_list = load_model(tweet)

    # Keep both plots on the same 0-1 scale and use plain Python floats so the
    # "Values" columns are numeric rather than holding 0-d array objects.
    emotion_values = [float(x) for x in preemotion_list]
    personality_scores = [float(x) for x in personality_list]

    emotion_frame = pd.DataFrame(
        {
            "Emotions": ["Anger", "Anticipation", "Disgust", "Fear", "Joy", "Love", "Optimism", "Pessimism", "Sadness","Surprise","Trust"],
            "Values": emotion_values,
        }
    )
    personality_values = pd.DataFrame(
        {
            "Personality": ['Agreeableness', 'Conscientiousness', 'Extraversion', 'Neuroticism', 'Openness'],
            "Values": personality_scores,
        }
    )

    # `prediction` is a probability in [0, 1]; scale it before adding "%".
    hate_probability = prediction.item()
    predicted_class = torch.round(prediction).item()
    print(emotion_values)
    print(personality_scores)
    print("end")

    label = "Non Hate" if predicted_class == 0.0 else "Hate"

    # Fixed-point formatting avoids float artefacts such as 0.13000000000000003.
    hate_pct = f"{hate_probability * 100:.2f}%"
    non_hate_pct = f"{(1 - hate_probability) * 100:.2f}%"

    return label,hate_pct,non_hate_pct,emotion_frame,personality_values

# demo = gr.Interface(fn=greet, inputs="text", outputs="text")
# Build the Gradio UI. The two bar plots start empty (value=None) and are
# filled by the DataFrames that `greet` returns. The original passed the name
# `personality_values` here, which only exists as a local inside `greet`, so
# the module failed with NameError before the interface was created.
demo = gr.Interface(
    title = "Unmasking Hate: An Integrated Approach to Detecting Hate Speech in Social Media",
    fn=greet, inputs=gr.Textbox(placeholder="Enter an input sentence...",label="Input Sentence"), 
    allow_flagging = "never",outputs=[
        gr.Label(label="Label"),
        gr.Textbox(label="Hate Speech Percentage"),
        gr.Textbox(label="Non Hate Speech Percentage"),
        gr.BarPlot(
            value=None,
            x="Emotions",
            y="Values",
            title="Emotion Analysis",
            tooltip=["Emotions", "Values"],
            y_lim=[0, 1],
            label="Emotion bar graph"
        ),
        gr.BarPlot(
            value=None,
            x="Personality",
            y="Values",
            title="Personality Analysis",
            tooltip=["Personality", "Values"],
            y_lim=[0, 1],
            label="Personality bar graph"
        )
    ],
    examples=[
        ["I like you"],
        ["I hate you"],
        ["I can't stand those asian always causing trouble. They need to go back to where they came from!"],
        ["Just saw a Sunni preaching on the street. Why don't they go worship in their own country instead of invading ours?"],
        ["I wish all bisexuals would just disappear. Sick of their agenda being shoved in our faces"],
        ["Had a great time celebrating diversity at the multicultural festival today!"],
        ["Congratulations to Sri Lankans for their cultural contributions to our society"],
        ["Love is love, no matter who you are or who you love"] ]
        )

demo.launch()