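# Page-scrape residue from the hosting site's file viewer (not part of the program),
# preserved as comments so the module parses:
#   simran0608's picture | Update app.py | efb850b verified | raw | history blame | 7.69 kB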
import streamlit as st
import pandas as pd
import seaborn as sns
import numpy as np
import pickle
import matplotlib.pyplot as plt
from data_preparation import preprocess_data,data_imp
from clustering import perform_clustering, plot_clusters,summarize_cluster_characteristics
from feature_selection import select_features_pca, select_features_rfe, select_features_rf
from sklearn.preprocessing import StandardScaler
# Per-dataset feature descriptions and default prediction inputs, all supplied
# by data_imp() in the order unpacked below.
# NOTE: "bankng_feature_descriptions" is misspelled, but other code in this file
# reads it under exactly that name, so the spelling is kept.
(
    insurance_feature_descriptions,
    bankng_feature_descriptions,
    retail_feature_descriptions,
    insurance_defaults,
    banking_defaults,
    retail_defaults,
) = data_imp()
def load_data(dataset_choice):
    """Load the raw dataset for the chosen business domain.

    Args:
        dataset_choice: One of "Insurance", "Retail" or "Banking".

    Returns:
        The DataFrame read from the matching file (paths are relative to the
        current working directory).

    Raises:
        ValueError: If ``dataset_choice`` is not one of the supported names.
            Previously this case fell through every branch and failed with an
            UnboundLocalError on the ``return``.
    """
    # (reader, path) for each dataset; every file is decoded as latin1.
    sources = {
        "Insurance": (pd.read_sas, 'a2z_insurance.sas7bdat'),
        "Retail": (pd.read_csv, 'retaildata.csv'),
        "Banking": (pd.read_csv, 'bankingdata.csv'),
    }
    try:
        reader, path = sources[dataset_choice]
    except KeyError:
        supported = ", ".join(sources)
        raise ValueError(
            f"Unknown dataset {dataset_choice!r}; expected one of: {supported}"
        ) from None
    return reader(path, encoding='latin1')
# Function to display Business Understanding section
def display_business_understanding():
    """Render the "Business Understanding" page: a subheader, the objective text and an image."""
    # Markdown body; kept flush-left so the rendered text matches the literal exactly.
    objective_markdown = """
###### Customer segmentation is a fundamental task in marketing and customer relationship management. With the advancements in data analytics and machine learning, it is now possible to group customers into distinct segments with a high degree of precision, allowing businesses to tailor their marketing strategies and offerings to each segment's unique needs and preferences.
###### Through this customer segmentation, businesses can achieve:
- **Personalization**: Tailoring marketing strategies to meet the unique needs of each segment.
- **Optimization**: Efficient allocation of marketing resources.
- **Insight**: Gaining a deeper understanding of the customer base.
- **Engagement**: Enhancing customer engagement and satisfaction.
###### => Problem/Requirement: Utilize machine learning and data analysis techniques in Python to perform customer segmentation.
"""
    st.subheader("Business Objective")
    st.write(objective_markdown)
    st.image(
        "Customer-Segmentation.png",
        caption="Customer Segmentation",
        use_column_width=True,
    )
# Function to display Dataset section
def display_dataset_selection():
    """Render the "Dataset" page for a user-selected dataset.

    Shows the dataset name, its row and column counts, the first five rows and
    the feature descriptions that belong to the chosen dataset.

    Returns:
        The loaded dataset, as returned by ``load_data``.
    """
    dataset_choice = st.selectbox("Select Dataset", ("Insurance", "Retail", "Banking"))
    data = load_data(dataset_choice)

    st.write(f"Dataset: {dataset_choice}")
    st.write("Number of rows:", data.shape[0])
    st.write("Number of columns:", data.shape[1])
    st.write("First five rows of the data:")
    st.write(data.head())

    if dataset_choice == "Insurance":
        # Fix: this branch used the undefined name `feature_descriptions`, which
        # raised NameError whenever "Insurance" was selected.
        st.write(insurance_feature_descriptions)
    elif dataset_choice == "Retail":
        st.write(retail_feature_descriptions)
    else:
        # The module-level name is spelled "bankng_..." where it is defined.
        st.write(bankng_feature_descriptions)
    return data
# Function to display Modeling & Evaluation section
def display_modeling_evaluation():
    """Render the "Modeling & Prediction" page.

    Workflow:
      1. Load and preprocess the dataset chosen in the select box.
      2. Read the feature-selection method, cluster count and feature/component
         count from the sidebar.
      3. When "Cluster" is pressed, select features, run clustering, show the
         silhouette score, clustered data and plot, and keep the results in
         ``st.session_state`` so they survive Streamlit reruns.
      4. Once a model is stored, show a form with one numeric input per selected
         feature; on "Predict", assign the input to a cluster, summarise that
         cluster and plot the new point against the clustered data.

    Fixes relative to the previous version:
      * The user's single row was scaled with a StandardScaler fitted on that
        same row. Standardising one sample yields all zeros, so the predicted
        cluster never depended on the input. The scaler is now fitted on the
        training features and only applied to the user's row.
      * Default input values are looked up for the dataset the stored model was
        trained on, not whichever dataset is currently selected.
    """
    dataset_choice = st.selectbox("Select Dataset", ("Insurance", "Retail", "Banking"))
    data = load_data(dataset_choice)
    data = preprocess_data(data)

    # Sidebar for feature selection and clustering method
    st.sidebar.header("Feature Selection and Clustering Method")
    feature_selection_method = st.sidebar.selectbox(
        "Select feature selection method", ('PCA', 'RFE', 'Random Forest')
    )
    n_clusters = st.sidebar.slider("Number of clusters", min_value=2, max_value=10, value=3)

    if feature_selection_method == 'PCA':
        n_components = st.sidebar.slider(
            "Number of PCA components", min_value=2, max_value=10, value=5
        )
    elif feature_selection_method in ['RFE', 'Random Forest']:
        n_features_to_select = st.sidebar.slider(
            "Number of features to select", min_value=2, max_value=10, value=5
        )

    # Perform clustering on button click
    if st.sidebar.button("Cluster"):
        if feature_selection_method == 'PCA':
            selected_data, selected_features = select_features_pca(data, n_components)
        elif feature_selection_method == 'RFE':
            selected_data, selected_features = select_features_rfe(data, n_features_to_select)
        elif feature_selection_method == 'Random Forest':
            selected_data, selected_features = select_features_rf(data, n_features_to_select)

        st.write(f"Selected Features: {selected_features}")
        clustered_data, score, df_value_scaled, labels, model = perform_clustering(
            selected_data, n_clusters
        )
        st.write(f"Number of Clusters: {n_clusters}")
        st.write(f"Silhouette Score: {score}")
        st.write("Clustered Data")
        st.write(clustered_data)
        st.write("Cluster Visualization")
        plot_clusters(df_value_scaled, labels)

        # Keep everything the prediction step needs; the button reads False on
        # the rerun triggered by submitting the prediction form.
        st.session_state.selected_features = selected_features
        st.session_state.model = model
        st.session_state.clustered_data = clustered_data
        st.session_state.labels = labels
        st.session_state.df_value_scaled = df_value_scaled
        # Unscaled training features, used to fit the scaler at prediction time.
        st.session_state.selected_data = selected_data
        # Remember which dataset the stored model belongs to.
        st.session_state.trained_dataset = dataset_choice

    # Predict new data based on selected features
    if 'selected_features' in st.session_state and 'model' in st.session_state:
        st.write("### Predict Cluster")

        # Defaults must match the dataset the stored model was trained on; fall
        # back to the current selection if that was never recorded.
        defaults_by_dataset = {
            "Insurance": insurance_defaults,
            "Banking": banking_defaults,
            "Retail": retail_defaults,
        }
        trained_dataset = st.session_state.get('trained_dataset', dataset_choice)
        defaults = defaults_by_dataset.get(trained_dataset, {})

        # Use st.form so inputs are only submitted when "Predict" is pressed
        with st.form(key='prediction_form'):
            user_input = {}
            for feature in st.session_state.selected_features:
                default_value = defaults.get(feature, 0.0)
                user_input[feature] = st.number_input(f'Enter {feature}', value=default_value)
            submit_button = st.form_submit_button(label='Predict')

        if submit_button:
            user_df = pd.DataFrame(user_input, index=[0])

            # Fit the scaler on the training features and only transform the
            # user's row. Plain arrays are used on both sides so scikit-learn's
            # feature-name check cannot reject differing column labels.
            # NOTE(review): assumes perform_clustering standardises
            # `selected_data` the same way (StandardScaler over all columns, in
            # the order of `selected_features`) — confirm against clustering.py.
            training_features = st.session_state.get('selected_data')
            if training_features is None:
                st.warning("Please run clustering again before predicting.")
                return
            scaler = StandardScaler().fit(np.asarray(training_features))
            user_df_scaled = scaler.transform(user_df.to_numpy())

            cluster = st.session_state.model.predict(user_df_scaled)
            st.write(f'The predicted cluster for the input data is: {cluster[0]}')

            # Get the clustered data and labels from session state
            clustered_data = st.session_state.clustered_data
            labels = st.session_state.labels
            df_value_scaled = st.session_state.df_value_scaled

            # Summarize cluster characteristics
            summary = summarize_cluster_characteristics(clustered_data, labels, cluster[0])

            # Generate and display the inference
            header = f"Based on the input features, the customer belongs to Cluster {cluster[0]}, which is characterized by the following average values:\n"
            bullet_lines = "".join(
                f"- {feature}: {value:.2f}\n" for feature, value in summary.items()
            )
            st.write(header + bullet_lines)

            plot_clusters(df_value_scaled, labels, new_data_point=user_df_scaled)
# Main app structure
def main():
    """Draw the app title and header, then render the page picked in the sidebar menu."""
    st.title("Customer Segmentation Demo")
    st.header("Customer Segmentation")

    # Menu label -> page renderer; insertion order is the order shown in the sidebar.
    pages = {
        "Business Understanding": display_business_understanding,
        "Dataset": display_dataset_selection,
        "Modeling & Prediction": display_modeling_evaluation,
    }
    choice = st.sidebar.selectbox('Menu', list(pages))

    render_page = pages.get(choice)
    if render_page is not None:
        render_page()
# Run the app only when this file is executed as the entry script, not on import.
if __name__ == "__main__":
    main()