import re

import gradio as gr
import joblib
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from lime.lime_tabular import LimeTabularExplainer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder

nltk.download('punkt')

# Load the data; categorical columns are label encoded below
df = pd.read_csv("Data.csv")
df2 = df.copy()

# Categorical feature columns; the target 'Attrition' is encoded separately
object_cols = df2.select_dtypes(include=['object']).columns
object_cols = object_cols.delete(object_cols.get_loc('Attrition'))
# Numeric feature columns
int_cols = df2.select_dtypes(exclude=['object']).columns

# Fit one LabelEncoder per categorical column and keep it for inference
le_dict = {}
classes_dict = {}
for col in object_cols:
    le = LabelEncoder()
    df2[col] = le.fit_transform(df2[col])
    le_dict[col] = le
    classes_dict[col] = le.classes_
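
# Illustrative (not part of the original flow): each stored encoder can
# round-trip labels, e.g. for the first categorical column:
# codes = le_dict[object_cols[0]].transform(classes_dict[object_cols[0]])
# labels = le_dict[object_cols[0]].inverse_transform(codes)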
    
# Features are every column except the last, which is assumed to be 'Attrition'
X = df2.iloc[:, :-1]
y = df2.iloc[:, -1]

# One-hot encode the target; OneHotEncoder sorts categories, so the columns
# come out in the order ['No', 'Yes']
encoder = OneHotEncoder()
y2 = encoder.fit_transform(np.array(y).reshape(-1, 1))
y3 = pd.DataFrame(y2.toarray(), columns=['No', 'Yes'])


# The Gradio widgets are created categorical-first, then numeric, so record
# that input order for mapping *args back to column names
colList = list(object_cols) + list(int_cols)


# Scale all features to [0, 1]
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
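
# Persist the fitted scaler and encoders with joblib so inference elsewhere can
# reuse the exact same preprocessing (optional sketch; the filenames are
# illustrative, not artifacts the original app relies on)
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(le_dict, 'label_encoders.joblib')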

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y3, test_size=0.2, random_state=0)

# Load the model
loaded_model = tf.keras.models.load_model('Final_NN_model.keras')
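
# Optional sanity check (a sketch, assuming 'Final_NN_model.keras' was trained
# on this exact preprocessing/split and saved with its compile state):
results = loaded_model.evaluate(X_test, np.asarray(y_test), verbose=0)
print("Held-out loss/metrics:", results)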

# Create a LIME explainer
explainer = LimeTabularExplainer(training_data=X_scaled, class_names=[0, 1], mode="classification", feature_names=list(X.columns))
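
# Example (illustrative, commented out because LIME is slow): explain one
# held-out row and print its top feature contributions.
# exp = explainer.explain_instance(X_test[0], loaded_model.predict, num_features=5)
# print(exp.as_list())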

# Prediction function wired to the Gradio inputs
def predict_label(*args):
    # Empty textboxes arrive as '' and unselected dropdowns as None
    if '' in args or None in args:
        return "Please fill in all inputs", pd.DataFrame([['awaiting inputs', 'awaiting inputs']], columns=["Feature", "Impact"])

    # Create empty dictionaries to hold the input data
    input_dict = {}
    input_df = {}

    # Map inputs and col names
    for i, col in enumerate(colList):
        input_dict[col] = args[i]

    # Reorder the inputs to match the column order of X
    for col in X.columns:
        input_df[col] = input_dict[col]

    # Build a single-row DataFrame from the inputs
    input_df = pd.DataFrame([input_df], columns=input_df.keys())

    # Encode labels of the object (categorical) columns
    for col in le_dict:
        input_df[col] = le_dict[col].transform(input_df[col])

    # Numeric values arrive as strings from the textboxes; cast them
    for col in int_cols:
        input_df[col] = pd.to_numeric(input_df[col])

    # Scale with the scaler fitted above (returns a (1, n) numpy array)
    input_df = scaler.transform(input_df)

    # Predict once (the model is already loaded at module level) and read off
    # both class probabilities, in percent: index 0 = 'No', index 1 = 'Yes'
    probs = loaded_model.predict(input_df)[0]
    predof0 = round(probs[0], 4) * 100
    predof1 = round(probs[1], 4) * 100
    
    # Explain the prediction for every input feature
    exp = explainer.explain_instance(data_row=input_df[0], predict_fn=loaded_model.predict, num_features=len(X.columns))

    # Map each explained feature to its direction of impact. LIME conditions
    # look like '0.25 < Age <= 0.50'; the alphabetic token is the feature name.
    featimp = {}
    for condition, weight in exp.as_list():
        feature = None
        for word in word_tokenize(condition):
            if re.findall(r'[a-zA-Z]+', word):
                feature = word
        if feature is None:
            continue
        # LIME explains the 'Yes' (attrition) class here, so a positive weight
        # pushes toward attrition, i.e. a negative impact on retention
        if round(weight, 2) <= 0:
            featimp[feature] = 'positive impact on retention'
        else:
            featimp[feature] = 'negative impact on retention'

    # Convert the dictionary to a list of tuples for the Gradio table
    featimp_table = [(key, value) for key, value in featimp.items()]

    # Bucket the verdict by the probability of 'No' (the employee staying)
    if predof0 >= 60:
        return f"Low probability ({predof1:.2f}%) of attrition", featimp_table
    elif predof0 >= 30:
        return f"Some probability ({predof1:.2f}%) of attrition", featimp_table
    else:
        return f"High probability ({predof1:.2f}%) of attrition", featimp_table

# Build the input widgets: dropdowns for categorical columns, textboxes for numeric
obj_config = [gr.Dropdown(label=name, choices=sorted(classes_dict[name].tolist())) for name in object_cols]
int_config = [gr.Textbox(label=name, placeholder='enter a number') for name in int_cols]

# Concatenate the two sets of input configurations
input_config = obj_config + int_config

# Gradio interface
iface = gr.Interface(
    title="Attrition Prediction",
    description="Based on your inputs, this model predicts whether an employee in an organisation would resign.",
    allow_flagging='never',
    fn=predict_label,
    inputs=input_config,
    outputs=[
        gr.Textbox(label="Prediction"),
        gr.DataFrame(headers=["Feature", "Impact"], label="All features and their impact on retention")
    ],
    live=False  # set to True to re-run the prediction whenever an input changes
)

# Launch the Gradio interface; share=True also serves a temporary public link
iface.launch(share=True)