|
import streamlit as st |
|
import pandas as pd |
|
import seaborn as sns |
|
import numpy as np |
|
import pickle |
|
import matplotlib.pyplot as plt |
|
from data_preparation import preprocess_data,data_imp |
|
from clustering import perform_clustering, plot_clusters,summarize_cluster_characteristics |
|
from feature_selection import select_features_pca, select_features_rfe, select_features_rf |
|
from sklearn.preprocessing import StandardScaler |
|
|
|
# Per-dataset feature descriptions (shown on the Dataset page) and default
# values for the prediction form, unpacked in the order data_imp() returns them.
# NOTE: "bankng_feature_descriptions" keeps its existing spelling because
# display_dataset_selection reads it under that exact name.
(
    insurance_feature_descriptions,
    bankng_feature_descriptions,
    retail_feature_descriptions,
    insurance_defaults,
    banking_defaults,
    retail_defaults,
) = data_imp()
|
|
|
def load_data(dataset_choice):
    """Load the raw dataset that backs the given menu choice.

    Args:
        dataset_choice: Name of the dataset to load; one of "Insurance",
            "Retail" or "Banking".

    Returns:
        The DataFrame read from the file associated with ``dataset_choice``.

    Raises:
        ValueError: If ``dataset_choice`` is not a supported dataset name.
            (Previously this case surfaced as an UnboundLocalError because
            no branch assigned the result.)
    """
    # Reader function and file path for every supported dataset.
    sources = {
        "Insurance": (pd.read_sas, 'a2z_insurance.sas7bdat'),
        "Retail": (pd.read_csv, 'retaildata.csv'),
        "Banking": (pd.read_csv, 'bankingdata.csv'),
    }

    source = sources.get(dataset_choice)
    if source is None:
        supported = ", ".join(sources)
        raise ValueError(
            f"Unsupported dataset {dataset_choice!r}; expected one of: {supported}"
        )

    reader, path = source
    # All three files are read with the same latin1 encoding.
    return reader(path, encoding='latin1')
|
|
|
|
|
def display_business_understanding():
    """Render the "Business Understanding" page.

    Writes a subheader, the business-objective markdown text and the
    illustration image to the Streamlit page, in that order.
    """
    # The markdown text is kept verbatim (line for line) from the original
    # call so the rendered page does not change.
    objective_markdown = """

###### Customer segmentation is a fundamental task in marketing and customer relationship management. With the advancements in data analytics and machine learning, it is now possible to group customers into distinct segments with a high degree of precision, allowing businesses to tailor their marketing strategies and offerings to each segment's unique needs and preferences.



###### Through this customer segmentation, businesses can achieve:

- **Personalization**: Tailoring marketing strategies to meet the unique needs of each segment.

- **Optimization**: Efficient allocation of marketing resources.

- **Insight**: Gaining a deeper understanding of the customer base.

- **Engagement**: Enhancing customer engagement and satisfaction.



###### => Problem/Requirement: Utilize machine learning and data analysis techniques in Python to perform customer segmentation.



"""

    st.subheader("Business Objective")
    st.write(objective_markdown)
    st.image(
        "Customer-Segmentation.png",
        caption="Customer Segmentation",
        use_column_width=True,
    )
|
|
|
|
|
def display_dataset_selection():
    """Render the "Dataset" page and return the loaded data.

    Lets the user pick a dataset, loads it, and writes its name, its row and
    column counts, its first five rows and the matching feature descriptions
    to the Streamlit page.

    Returns:
        The data returned by ``load_data`` for the selected dataset.
    """
    dataset_choice = st.selectbox("Select Dataset", ("Insurance", "Retail", "Banking"))
    data = load_data(dataset_choice)

    st.write(f"Dataset: {dataset_choice}")
    st.write("Number of rows:", data.shape[0])
    st.write("Number of columns:", data.shape[1])
    st.write("First five rows of the data:")
    st.write(data.head())

    # Any choice other than Insurance or Retail shows the banking
    # descriptions, matching the original fall-through branch.
    descriptions_by_dataset = {
        "Insurance": insurance_feature_descriptions,
        "Retail": retail_feature_descriptions,
    }
    st.write(descriptions_by_dataset.get(dataset_choice, bankng_feature_descriptions))

    return data
|
|
|
|
|
def _dataset_defaults(dataset_name):
    """Return the default prediction inputs for *dataset_name* ({} if unknown)."""
    defaults_by_dataset = {
        "Insurance": insurance_defaults,
        "Banking": banking_defaults,
        "Retail": retail_defaults,
    }
    return defaults_by_dataset.get(dataset_name, {})


def display_modeling_evaluation():
    """Render the "Modeling & Prediction" page.

    The page loads and preprocesses the chosen dataset, offers the feature
    selection method and the number of clusters in the sidebar, and runs the
    clustering when the "Cluster" button is pressed. The clustering results
    are stored in ``st.session_state`` so the prediction form below them
    survives Streamlit reruns. Submitting the form predicts the cluster of
    the entered values, summarizes that cluster and re-plots the clusters
    with the new point.

    Returns:
        None. All output is written to the Streamlit page.
    """
    dataset_choice = st.selectbox("Select Dataset", ("Insurance", "Retail", "Banking"))
    data = load_data(dataset_choice)
    data = preprocess_data(data)

    st.sidebar.header("Feature Selection and Clustering Method")
    feature_selection_method = st.sidebar.selectbox(
        "Select feature selection method", ('PCA', 'RFE', 'Random Forest')
    )
    n_clusters = st.sidebar.slider("Number of clusters", min_value=2, max_value=10, value=3)

    # Only the slider that matches the chosen method is shown.
    if feature_selection_method == 'PCA':
        n_components = st.sidebar.slider(
            "Number of PCA components", min_value=2, max_value=10, value=5
        )
    elif feature_selection_method in ['RFE', 'Random Forest']:
        n_features_to_select = st.sidebar.slider(
            "Number of features to select", min_value=2, max_value=10, value=5
        )

    if st.sidebar.button("Cluster"):
        if feature_selection_method == 'PCA':
            selected_data, selected_features = select_features_pca(data, n_components)
        elif feature_selection_method == 'RFE':
            selected_data, selected_features = select_features_rfe(data, n_features_to_select)
        elif feature_selection_method == 'Random Forest':
            selected_data, selected_features = select_features_rf(data, n_features_to_select)

        # Fit the scaler for later user input on the training features.
        # It is fitted before perform_clustering runs so that anything the
        # clustering step might add to selected_data cannot leak into it.
        # NOTE(review): this assumes perform_clustering standardizes
        # selected_data in the same way - confirm against clustering.py.
        input_scaler = StandardScaler().fit(np.asarray(selected_data))

        st.write(f"Selected Features: {selected_features}")
        clustered_data, score, df_value_scaled, labels, model = perform_clustering(
            selected_data, n_clusters
        )
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {score}")
        st.write("Clustered Data")
        st.write(clustered_data)
        st.write("Cluster Visualization")
        plot_clusters(df_value_scaled, labels)

        # Persist everything the prediction form needs across reruns.
        st.session_state.selected_features = selected_features
        st.session_state.model = model
        st.session_state.clustered_data = clustered_data
        st.session_state.labels = labels
        st.session_state.df_value_scaled = df_value_scaled
        st.session_state.input_scaler = input_scaler
        # Remember which dataset the stored model belongs to, so the form
        # defaults stay consistent if the dataset selection changes later.
        st.session_state.clustered_dataset = dataset_choice

    # The prediction form is only available once a clustering run has
    # stored its model, feature list and input scaler.
    required_state = ("selected_features", "model", "input_scaler")
    if not all(key in st.session_state for key in required_state):
        return

    st.write("### Predict Cluster")

    # Defaults come from the dataset the model was trained on, not from the
    # dataset that happens to be selected right now.
    trained_on = st.session_state.get("clustered_dataset", dataset_choice)
    defaults = _dataset_defaults(trained_on)

    with st.form(key='prediction_form'):
        user_input = {}
        for feature in st.session_state.selected_features:
            default_value = defaults.get(feature, 0.0)
            user_input[feature] = st.number_input(f'Enter {feature}', value=default_value)

        submit_button = st.form_submit_button(label='Predict')

    if not submit_button:
        return

    user_df = pd.DataFrame(user_input, index=[0])

    # Transform with the scaler fitted on the training features. Fitting a
    # new StandardScaler on this single row would map every input to an
    # all-zero vector, making the prediction independent of the input.
    user_df_scaled = st.session_state.input_scaler.transform(user_df.to_numpy())

    cluster = st.session_state.model.predict(user_df_scaled)
    st.write(f'The predicted cluster for the input data is: {cluster[0]}')

    clustered_data = st.session_state.clustered_data
    labels = st.session_state.labels
    df_value_scaled = st.session_state.df_value_scaled

    summary = summarize_cluster_characteristics(clustered_data, labels, cluster[0])

    inference = f"Based on the input features, the customer belongs to Cluster {cluster[0]}, which is characterized by the following average values:\n"
    for feature, value in summary.items():
        inference += f"- {feature}: {value:.2f}\n"
    st.write(inference)

    plot_clusters(df_value_scaled, labels, new_data_point=user_df_scaled)
|
|
|
|
|
|
|
def main():
    """Render the app title and the page chosen in the sidebar menu."""
    st.title("Customer Segmentation Demo")
    st.header("Customer Segmentation")

    # Menu label -> function that renders the page; insertion order is the
    # order of the entries in the sidebar menu.
    pages = {
        "Business Understanding": display_business_understanding,
        "Dataset": display_dataset_selection,
        "Modeling & Prediction": display_modeling_evaluation,
    }

    choice = st.sidebar.selectbox('Menu', list(pages))

    render_page = pages.get(choice)
    if render_page is not None:
        render_page()
|
|
|
# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()
|
|