import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras import regularizers
import tensorflow as tf
import joblib
import re
from lime.lime_tabular import LimeTabularExplainer
import gradio as gr
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Load the raw data; df stays untouched, df2 receives the encoded columns
le_dict = {}
df = pd.read_csv("Data.csv")
df2 = df.copy()

# Categorical feature columns (the 'Attrition' target is excluded) and
# the remaining non-object columns
object_cols = df2.select_dtypes(include=['object']).columns
object_cols = object_cols.delete(object_cols.get_loc('Attrition'))
int_cols = df2.select_dtypes(exclude=['object']).columns

# Label encode object columns, keeping each fitted encoder so the same
# mapping can be applied to user input at prediction time
for col in object_cols:
    le = LabelEncoder()
    df2[col] = le.fit_transform(df[col])
    le_dict[col] = le

# Split features and target by name, consistent with how 'Attrition' is
# excluded from object_cols above (equivalent to the positional split
# whenever 'Attrition' is the last column)
X = df2.drop(columns=['Attrition'])
y = df2['Attrition']

# Order in which the UI passes values to predict_label:
# categorical columns first, then the non-object columns
colList = list(object_cols) + list(int_cols)

# Known categories per categorical column, taken from the encoders fitted
# above instead of fitting a second set of encoders on the same data
classes_dict = {col: encoder.classes_ for col, encoder in le_dict.items()}

# Scale every feature to [0, 1]; the fitted scaler is reused for user input
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=0)

# Load the model
loaded_model = tf.keras.models.load_model('Final_NN_model.keras')

# Create a LIME explainer over the scaled feature matrix
explainer = LimeTabularExplainer(X_scaled, mode="classification", feature_names=X.columns)

# Your machine learning model function
def predict_label(*args):
    """Predict the attrition probability for one employee and explain it.

    Args:
        *args: One value per input widget, in ``colList`` order (the
            categorical columns first, then the non-object columns).

    Returns:
        A ``(message, table)`` pair. ``message`` names the risk band and
        the predicted percentage; ``table`` is a list of up to five
        ``(feature, weight)`` tuples taken from the LIME explanation,
        strongest influence first. If an input is missing or a numeric
        field cannot be parsed, ``message`` asks the user to correct it
        and ``table`` is a one-row placeholder DataFrame.
    """
    placeholder = pd.DataFrame(
        [['awaiting inputs', 'awaiting inputs']],
        columns=["Feature", "Importance"],
    )

    # Empty textboxes arrive as ''; a dropdown without a selection is
    # presumably passed as None by Gradio, so both count as missing.
    if any(value is None or value == '' for value in args):
        return "Please fill in all inputs", placeholder

    # Map inputs to column names
    input_dict = dict(zip(colList, args))

    # Textboxes deliver strings: convert here so a typo produces a
    # readable message instead of an exception inside scaler.transform.
    for col in int_cols:
        try:
            number = float(input_dict[col])
        except (TypeError, ValueError):
            number = float("nan")
        if not np.isfinite(number):
            return f"Please enter a valid number for {col}", placeholder
        input_dict[col] = number

    # Rearrange the values into the column order of X
    row = pd.DataFrame([{col: input_dict[col] for col in X.columns}])

    # Encode labels of object columns with the encoders fitted on the data
    for col, encoder in le_dict.items():
        row[col] = encoder.transform(row[col])

    # Scale columns; the result is a (1, n_features) array
    features = scaler.transform(row)

    # Use the model loaded once at module level rather than re-reading
    # 'Final_NN_model.keras' from disk on every request.
    probability = float(loaded_model.predict(features)[0][0])

    # Round the percentage itself so the band chosen below always agrees
    # with the two-decimal value shown to the user.
    pred = round(probability * 100, 2)

    # Explain the prediction
    exp = explainer.explain_instance(
        features[0],
        loaded_model.predict,
        labels=(0,),
        num_features=len(X.columns),
    )

    # as_map() yields (feature_index, weight) pairs for label 0, which
    # avoids parsing LIME's textual descriptions (e.g. "a < name <= b")
    # to recover the feature name. Slicing copes with fewer than five.
    top5_table = [
        (X.columns[int(index)], round(float(weight), 2))
        for index, weight in exp.as_map()[0][:5]
    ]

    # Return prediction
    if pred <= 30:
        level = "Low"
    elif pred <= 70:
        level = "Some"
    else:
        level = "High"
    return f"{level} probability ({pred:.2f}%) of attrition", top5_table

# One input widget per feature: a dropdown listing the known categories
# for each categorical column, a free-text box for every other column
obj_config = []
for name in object_cols:
    options = sorted(classes_dict[name].tolist())
    obj_config.append(gr.Dropdown(label=name, choices=options))

int_config = []
for name in int_cols:
    int_config.append(gr.Textbox(label=name, placeholder='enter a number'))

# Widget order must match colList: categorical first, then the rest
input_config = [*obj_config, *int_config]

# Text shown at the top of the page
app_description = "This app predicts if an employee in your organisation would resign or not. The values shown under top features shows influence of each feature on the prediction. A higher number indicates that the feature is more influential in determining the prediction, while a lower number indicates less influence."

# Output widgets: the prediction message and the feature-influence table
prediction_box = gr.Textbox(label="Prediction")
influence_table = gr.DataFrame(
    headers=["Feature", "Importance"],
    label="Top 5 featured influencing prediction",
)

# Gradio Interface
iface = gr.Interface(
    fn=predict_label,
    inputs=input_config,
    outputs=[prediction_box, influence_table],
    title="Attrition Prediction",
    description=app_description,
    allow_flagging='never',
    live=False,  # do not re-run automatically when an input changes
)

# Launch the Gradio interface with a public share link
iface.launch(share=True)