File size: 3,233 Bytes
a7b2523
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pickle
import pandas as pd
import streamlit as st
from models import  KMeans
from models import  KMedoids
import matplotlib.pyplot as plt
from models import EnsembleClustering 
from sklearn.decomposition import PCA

# Page heading shown above the input form
st.title("Customer Purchase Behavior")

# Make sure the submission flag exists before main() reads it
if "form_submitted" not in st.session_state:
    st.session_state.form_submitted = False

# Load the dataset that already carries a cluster label per row
@st.cache_data
def read_csv():
    """Return the contents of ``dataset/clustered_dataset.csv`` as a DataFrame.

    The path is relative to the working directory. The function is wrapped
    in ``st.cache_data``.
    """
    return pd.read_csv("dataset/clustered_dataset.csv")

# Loading the model
@st.cache_resource
def load_model():
    """Load the pickled clustering model from ``model/clustering.pkl``.

    The file is opened in a ``with`` block so its handle is closed as soon
    as unpickling finishes, even if ``pickle.load`` raises. The function is
    wrapped in ``st.cache_resource``.

    Returns:
        The object stored in the pickle file.
    """
    # NOTE(security): pickle.load can execute arbitrary code while loading;
    # only point this at a model file that ships with the app and is trusted.
    # NOTE(review): unpickling presumably needs the classes imported from
    # ``models`` at the top of this file - confirm before removing those imports.
    with open("model/clustering.pkl", "rb") as model_file:
        return pickle.load(model_file)

# Cluster prediction for the given feature vector
@st.cache_data
def predict(_model, features):
    """Return the label ``_model.predict(features)`` yields, as a Python scalar.

    ``.item()`` is called on the prediction, so ``features`` must produce
    exactly one label.
    NOTE(review): the leading underscore on ``_model`` is Streamlit's
    convention for excluding an argument from cache hashing - confirm
    against the Streamlit version in use.
    """
    return _model.predict(features).item()

# Callback wired to the form's submit button
def form_submission():
    """Flag in session state that the input form has been submitted."""
    st.session_state.form_submitted = True

# Data transformation
def transform_data():
    """Project the dataset plus the form input onto two principal components.

    The PCA is fitted on the stored dataset only and then applied to the
    dataset with the form input appended. Fitting without the input keeps
    the projection axes independent of the value being projected, and equal
    to the axes ``clusterPointsMean`` derives for the plotted cluster centres.

    Returns:
        Array with one row per dataset row followed by a final row for the
        form input; each row holds the two component values.
    """
    df = read_csv().drop(columns=["Cluster"])

    # NOTE(review): assumes the CSV's remaining columns are ordered revenue,
    # frequency, recency, UK flag - confirm against the dataset.
    input_data = [
        st.session_state["Revenue_given"],
        st.session_state["Frequency"],
        st.session_state["Recency"],
        0 if st.session_state["uk"] == "No" else 1,
    ]

    pca = PCA(n_components=2, random_state=42)
    # Fit before appending so the user's row cannot shift the components.
    pca.fit(df)

    # Appending at label len(df) relies on the default 0..n-1 row index.
    df.loc[len(df)] = input_data
    return pca.transform(df)

# 2-D PCA coordinates of every dataset row, tagged with its cluster label
@st.cache_data
def clusterPointsMean(df):
    """Return the two-component PCA projection of ``df`` with cluster labels.

    The ``Cluster`` column is set aside, the remaining columns are reduced
    to two principal components, and the result is returned as a DataFrame
    with columns ``X1``, ``X2`` and ``Cluster``. Despite the name, no mean
    is computed here.
    """
    labels = df["Cluster"]
    projected = PCA(n_components=2, random_state=42).fit_transform(
        df.drop(columns=["Cluster"])
    )
    return pd.DataFrame(
        {
            "X1": projected[:, 0],
            "X2": projected[:, 1],
            "Cluster": labels,
        }
    )

# Display plot
@st.cache_data
def displayPoints(data, feature):
    """Plot the two cluster centres and the customer's point.

    Args:
        data: DataFrame with a ``Cluster`` column (labels 0 and 1 are used)
            and exactly two other columns, whose per-cluster means become
            the centre coordinates.
        feature: Indexable pair; ``feature[0]`` and ``feature[1]`` are the
            x and y position of the customer's point.
    """
    c1_x, c1_y = data[data['Cluster'] == 0].drop(['Cluster'], axis=1).mean()
    c2_x, c2_y = data[data['Cluster'] == 1].drop(['Cluster'], axis=1).mean()

    fig, ax = plt.subplots()
    try:
        ax.scatter(c1_x, c1_y, color="red", label="Cluster-1", s = 150)
        ax.scatter(c2_x, c2_y, color="blue", label="Cluster-2", s = 150)
        ax.scatter(feature[0], feature[1], color="green", label="Data point", s = 150)
        ax.legend()
        st.pyplot(fig=fig)
    finally:
        # pyplot keeps every figure it creates until it is closed; release
        # it here so figures do not pile up over successive calls.
        plt.close(fig)

def main():
    """Render the input form and, once it has been submitted, the result.

    After submission the form values are projected, the model's label is
    shown (offset by one for display), and the point is plotted next to the
    cluster centres.
    """
    with st.form(border=False, key="data_form"):
        left, right = st.columns(2)

        left.number_input(label="Revenue Contribution", key="Revenue_given")
        left.number_input(label="Days past since last buy", min_value=0, value=0, key="Recency")

        right.number_input(label="How Frequently bought", min_value=0, value=0, key="Frequency")
        right.selectbox(label="From United Kingdom?", options=["Yes", "No"], index=0, key="uk")

        st.form_submit_button(label="Cluster Customer", type='primary', on_click=form_submission)

    # Nothing more to show until the submit callback has set the flag.
    if not st.session_state["form_submitted"]:
        return

    # transform_data() puts the form input in the last row of its result.
    customer_point = transform_data()[-1]

    cluster_number = predict(load_model(), customer_point.reshape(1, 2)) + 1
    st.markdown(f"## The customer falls within Cluster - {cluster_number}")

    displayPoints(clusterPointsMean(df=read_csv()), customer_point)

# Script entry point: build the page only when this file is run directly.
if __name__ == "__main__":
    main()