import re

import gradio as gr
import joblib
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from lime.lime_tabular import LimeTabularExplainer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

nltk.download('punkt')

# Load the data; categorical columns are label encoded below
df = pd.read_csv("Data.csv")
df2 = df.copy()

# Categorical feature columns; the target 'Attrition' is encoded separately
object_cols = df2.select_dtypes(include=['object']).columns
object_cols = object_cols.delete(object_cols.get_loc('Attrition'))
# Numeric feature columns
int_cols = df2.select_dtypes(exclude=['object']).columns

# Fit one LabelEncoder per categorical column and keep it for inference
le_dict = {}
classes_dict = {}
for col in object_cols:
    le = LabelEncoder()
    df2[col] = le.fit_transform(df2[col])
    le_dict[col] = le
    classes_dict[col] = le.classes_
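
# Illustrative (not part of the original flow): each stored encoder can
# round-trip labels, e.g. for the first categorical column:
# codes = le_dict[object_cols[0]].transform(classes_dict[object_cols[0]])
# labels = le_dict[object_cols[0]].inverse_transform(codes)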
    
# Features are every column except the last, which is assumed to be 'Attrition'
X = df2.iloc[:, :-1]
y = df2.iloc[:, -1]

# One-hot encode the target; OneHotEncoder sorts categories, so the columns
# come out in the order ['No', 'Yes']
encoder = OneHotEncoder()
y2 = encoder.fit_transform(np.array(y).reshape(-1, 1))
y3 = pd.DataFrame(y2.toarray(), columns=['No', 'Yes'])


# The Gradio widgets are created categorical-first, then numeric, so record
# that input order for mapping *args back to column names
colList = list(object_cols) + list(int_cols)


# Scale all features to [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
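
# Persist the fitted scaler and encoders with joblib so inference elsewhere can
# reuse the exact same preprocessing (optional sketch; the filenames are
# illustrative, not artifacts the original app relies on)
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(le_dict, 'label_encoders.joblib')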

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y3, test_size=0.2, random_state=0)

# Load the model
loaded_model = tf.keras.models.load_model('Final_NN_model.keras')
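
# Optional sanity check (a sketch, assuming 'Final_NN_model.keras' was trained
# on this exact preprocessing/split and saved with its compile state):
results = loaded_model.evaluate(X_test, np.asarray(y_test), verbose=0)
print("Held-out loss/metrics:", results)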

# Create a LIME explainer
explainer = LimeTabularExplainer(training_data=X_scaled, class_names=[0, 1], mode="classification", feature_names=list(X.columns))
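
# Example (illustrative, commented out because LIME is slow): explain one
# held-out row and print its top feature contributions.
# exp = explainer.explain_instance(X_test[0], loaded_model.predict, num_features=5)
# print(exp.as_list())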

# Prediction function wired to the Gradio inputs
def predict_label(*args):
    # Empty textboxes arrive as '' and unselected dropdowns as None
    if '' in args or None in args:
        return "Please fill in all inputs", pd.DataFrame([['awaiting inputs', 'awaiting inputs']], columns=["Feature", "Impact"])

    # Create empty dictionaries to hold the input data
    input_dict = {}
    input_df = {}

    # Map inputs and col names
    for i, col in enumerate(colList):
        input_dict[col] = args[i]

    # Reorder the inputs to match the column order of X
    for col in X.columns:
        input_df[col] = input_dict[col]

    # Build a single-row DataFrame from the inputs
    input_df = pd.DataFrame([input_df], columns=input_df.keys())

    # Encode labels of the object (categorical) columns
    for col in le_dict:
        input_df[col] = le_dict[col].transform(input_df[col])

    # Numeric values arrive as strings from the textboxes; cast them
    for col in int_cols:
        input_df[col] = pd.to_numeric(input_df[col])

    # Scale with the scaler fitted above (returns a (1, n) numpy array)
    input_df = scaler.transform(input_df)

    # Predict once (the model is already loaded at module level) and read off
    # both class probabilities, in percent: index 0 = 'No', index 1 = 'Yes'
    probs = loaded_model.predict(input_df)[0]
    predof0 = round(probs[0], 4) * 100
    predof1 = round(probs[1], 4) * 100
    
    # Explain the prediction for every input feature
    exp = explainer.explain_instance(data_row=input_df[0], predict_fn=loaded_model.predict, num_features=len(X.columns))

    # Map each explained feature to its direction of impact. LIME conditions
    # look like '0.25 < Age <= 0.50'; the alphabetic token is the feature name.
    featimp = {}
    for condition, weight in exp.as_list():
        feature = None
        for word in word_tokenize(condition):
            if re.findall(r'[a-zA-Z]+', word):
                feature = word
        if feature is None:
            continue
        # LIME explains the 'Yes' (attrition) class here, so a positive weight
        # pushes toward attrition, i.e. a negative impact on retention
        if round(weight, 2) <= 0:
            featimp[feature] = 'positive impact on retention'
        else:
            featimp[feature] = 'negative impact on retention'

    # Convert the dictionary to a list of tuples for the Gradio table
    featimp_table = [(key, value) for key, value in featimp.items()]

    # Bucket the verdict by the probability of 'No' (the employee staying)
    if predof0 >= 60:
        return f"Low probability ({predof1:.2f}%) of attrition", featimp_table
    elif predof0 >= 30:
        return f"Some probability ({predof1:.2f}%) of attrition", featimp_table
    else:
        return f"High probability ({predof1:.2f}%) of attrition", featimp_table

# Build the input widgets: dropdowns for categorical columns, textboxes for numeric
obj_config = [gr.Dropdown(label=name, choices=sorted(classes_dict[name].tolist())) for name in object_cols]
int_config = [gr.Textbox(label=name, placeholder='enter a number') for name in int_cols]

# Concatenate the two sets of input configurations
input_config = obj_config + int_config

# Gradio interface
iface = gr.Interface(
    title="Attrition Prediction",
    description="Based on your inputs, this model predicts whether an employee in an organisation would resign.",
    allow_flagging='never',
    fn=predict_label,
    inputs=input_config,
    outputs=[
        gr.Textbox(label="Prediction"),
        gr.DataFrame(headers=["Feature", "Impact"], label="All features and their impact on retention")
    ],
    live=False  # set to True to re-run the prediction whenever an input changes
)

# Launch the Gradio interface; share=True also serves a temporary public link
iface.launch(share=True)