import re
import string

import gradio as gr
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Load the trained model and the vectorizer that was fitted on the training
# corpus. NOTE: 'vectorizer.bin' is an assumed filename; the vectorizer must
# be the same object used at training time, because fitting a fresh
# CountVectorizer on a single input would produce a vocabulary (and feature
# dimension) the model has never seen.
model = joblib.load('model.bin')
vectorizer = joblib.load('vectorizer.bin')


def remove_punctuation(text):
    # Strip every punctuation character from the input string
    return "".join(ch for ch in text if ch not in string.punctuation)


def vectorize_text(texts):
    # Transform texts with the vectorizer loaded above (no refitting)
    return vectorizer.transform(texts)


def test_model(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = remove_punctuation(text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word not in stop_words]
    # Join the filtered tokens back into a string
    preprocessed_text = ' '.join(filtered_text)
    # Vectorize the preprocessed text
    text_vectorized = vectorize_text([preprocessed_text])
    # Make a prediction on the vectorized text and return it
    prediction = model.predict(text_vectorized)[0]
    return prediction


# Create the Gradio interface
iface = gr.Interface(fn=test_model, inputs="text", outputs="text",
                     title="Text Classification")
iface.launch()
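
# ------------------------------------------------------------------------------
# Training-side sketch (assumption): the code above expects both 'model.bin'
# and 'vectorizer.bin' to exist on disk. A minimal, hypothetical example of how
# they could have been produced at training time ('train_texts' and
# 'train_labels' are placeholders, not part of this script):
#
#     from sklearn.feature_extraction.text import CountVectorizer
#     from sklearn.naive_bayes import MultinomialNB
#     import joblib
#
#     vectorizer = CountVectorizer()
#     X_train = vectorizer.fit_transform(train_texts)   # list of preprocessed strings
#     clf = MultinomialNB().fit(X_train, train_labels)  # matching list of labels
#
#     joblib.dump(clf, 'model.bin')
#     joblib.dump(vectorizer, 'vectorizer.bin')
# ------------------------------------------------------------------------------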