import json
import re

import emoji
import gradio as gr
import numpy as np
import requests
from english_words import get_english_words_set
from thefuzz import fuzz

API_URL = "https://api-inference.huggingface.co/models/Dabid/abusive-tagalog-profanity-detection"
# NOTE: hard-coding a token exposes it to anyone who can read this file;
# prefer an environment variable or a Space secret.
headers = {"Authorization": "Bearer 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"}


def query(text):
    payload = {"inputs": text}
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()


def read_text(filename, filetype='txt'):
    words = []
    if filetype == 'txt':
        with open(filename + '.txt') as file:
            words = [line.rstrip() for line in file]
            words = list(set(words))
    elif filetype == 'json':
        with open(filename + '.json') as json_file:
            words = json.load(json_file)
    return words


contractions = read_text('contractions', 'json')
similar_words = read_text('similar_words')
addon_words = read_text('addon_words')
profanities_dict = read_text('profanities', 'json')

# Flatten the variation lists and the base profanities into one lookup list
lookup_profanity = np.concatenate([np.hstack(list(profanities_dict.values())),
                                   list(profanities_dict.keys())]).tolist()
lookup_words = list(set(similar_words).union(set(lookup_profanity)))
eng_words = list(get_english_words_set(['web2'], lower=True) - set(lookup_profanity))

# Strips leading/trailing punctuation but keeps # and @ so hashtags and mentions survive
punctuations = re.compile(r'^[^\w#@]+|[^\w#@]+$')


def fuzzy_lookup(tweet):
    matched_profanity = dict()

    for word in tweet.split():
        # Skip plain English words
        if word in eng_words:
            continue

        scores = []
        matched_words = []
        matched_word = None

        # Remove leading/trailing punctuation except # and @
        word = punctuations.sub('', word).lower()

        # Save base word
        base_word = word

        # Shorten elongated word
        word = re.sub(r'(.)\1{2,}', r'\1', word)

        # Remove # and @
        if word.startswith("#") or word.startswith("@"):
            word = word[1:]

        # Strip add-on words (mo, ka, pinaka) from either end
        for addon in addon_words:
            if word.startswith(addon):
                word = word[len(addon):]
            if word.endswith(addon):
                word = word[:-len(addon)]

        # Too short to match reliably
        if len(word) < 4:
            continue

        # Get fuzzy ratio against every lookup word
        for lookup_word in lookup_words:
            score = fuzz.ratio(word, lookup_word)

            # Threshold
            if score >= 70:
                scores.append(score)
                matched_words.append(lookup_word)

        if len(scores) == 0:
            continue

        if len(set(scores)) == 1:
            # All candidates are tied; prefer one that is itself a profanity
            for matched_word in matched_words:
                if matched_word in lookup_profanity:
                    break
        else:
            # Get matched word with max score
            max_score_index = np.argmax(scores)
            matched_word = matched_words[max_score_index]

        if matched_word not in lookup_profanity:
            continue

        # Map the match back to its base profanity
        for base_profanity, profanity_variations in profanities_dict.items():
            if matched_word in profanity_variations or matched_word == base_profanity:
                # Separate the pronoun from the base profanity
                for addon in addon_words:
                    if base_word.endswith(addon):
                        base_profanity = base_profanity + " " + addon
                        break
                matched_profanity[base_word] = base_profanity
                break

    return matched_profanity
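# A quick sketch of the matching rule above, run from a REPL (the inputs are
# illustrative misspellings, not entries from the bundled word lists):
#
#   >>> from thefuzz import fuzz
#   >>> fuzz.ratio("tangna", "tangina")   # 92 -> clears the 70 threshold
#   >>> fuzz.ratio("talaga", "tangina")   # ~62 -> discarded
#
# fuzz.ratio is a normalized edit-distance similarity, which is why
# misspelled or vowel-dropped profanity variants still get caught.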
def preprocess(tweet, profanities):
    tweet = tweet.lower()
    tweet = emoji.replace_emoji(tweet, replace='')

    # Replace detected profanities with their base form
    for base_word, matched_word in profanities.items():
        tweet = tweet.replace(base_word, matched_word)

    # Elongated words conversion
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet)

    row_split = tweet.split()

    for index, word in enumerate(row_split):
        # Remove links
        if 'http' in word:
            row_split[index] = ''

        # Unify laugh texts format to 'haha'
        laugh_texts = ['hahaha', 'wahaha', 'hahaa', 'ahha', 'haaha', 'hahah', 'ahah', 'hha']
        if any(x in word for x in laugh_texts):
            row_split[index] = 'haha'

    # Combine list of words back to sentence
    preprocessed_tweet = ' '.join(filter(None, row_split))

    # Single-word tweets have nothing left to expand
    if len(preprocessed_tweet.split()) == 1:
        return preprocessed_tweet

    # Expand contractions
    for contraction, expansion in contractions.items():
        preprocessed_tweet = re.sub(rf"\b{contraction}\b", expansion, preprocessed_tweet)

    return preprocessed_tweet


def predict(tweet):
    profanities = fuzzy_lookup(tweet)

    if len(profanities) > 0:
        preprocessed_tweet = preprocess(tweet, profanities)
        prediction = query(preprocessed_tweet)

        # The Inference API returns a dict (e.g. while the model is still
        # loading) instead of the usual nested list of label scores
        if isinstance(prediction, dict):
            print(prediction)
            error_message = prediction['error']
            return error_message, {}

        prediction = prediction[0][0]["label"]

        print("\nTWEET:", tweet)
        print("PROCESSED TWEET:", preprocessed_tweet)
        print("DETECTED PROFANITY:", list(profanities.keys()))
        print("LABEL:", prediction, "\n")

        return prediction, list(profanities.keys())

    return "No Profanity", {}


demo = gr.Interface(
    fn=predict,
    inputs=[gr.components.Textbox(lines=5, placeholder='Enter your input here', label='INPUT')],
    outputs=[gr.components.Text(label="PREDICTION"),
             gr.JSON(label="PROFANITIES")],
    examples=['Tangina mo naman sobrang yabang mo gago!!😠😤 @davidrafael',
              'Napakainit ngayong araw pakshet namaaan!!',
              'Napakabagal naman ng wifi tangina #PLDC #HelloDITO',
              'Bobo ka ba? napakadali lang nyan eh... 🤡',
              'Uy gago laptrip yung nangyare samen kanina HAHAHA😂😂'],
    allow_flagging="never",
    title="Tagalog Profanity Classifier",
)

demo.launch(debug=True)
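# Why predict() indexes prediction[0][0]["label"]: for text-classification
# models the Inference API responds with a nested list of label/score dicts,
# roughly shaped like the sketch below (the label names are illustrative,
# not confirmed from this model's config):
#
#   [[{"label": "Abusive", "score": 0.98},
#     {"label": "Non-Abusive", "score": 0.02}]]
#
# so [0][0] is the top-scoring label, while a still-loading model answers
# with {"error": ..., "estimated_time": ...}, handled by the isinstance() branch.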