"""Streamlit app that segments online-retail customers with K-Means.

The script loads a bundled Excel workbook, aggregates the transactions per
customer, standardizes the resulting features and lets the user pick the
number of clusters interactively.
"""

import matplotlib.pyplot as plt
import numpy as np  # unused here; kept because other code may rely on it
import pandas as pd
import seaborn as sns  # unused here; kept because other code may rely on it
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

DATA_FILE = "Online Retail.xlsx"
SHEET_NAME = "Online Retail"
REQUIRED_COLUMNS = ("CustomerID", "Quantity", "UnitPrice")
MAX_CLUSTERS = 10
RANDOM_STATE = 42
# Pinned explicitly: scikit-learn changed the default from 10 to "auto" in
# 1.4, so leaving it unset yields version-dependent clusters (and a
# FutureWarning on 1.2-1.3).
N_INIT = 10

ABOUT_TEXT = """
This app uses unsupervised learning techniques to segment customers based on their purchasing behavior.
The dataset is preloaded and contains online retail data.

### How It Works:
- **Step 1**: Load customer transaction data, including details like Quantity, UnitPrice, and CustomerID.
- **Step 2**: Process the data by calculating the total spent and aggregating the information by customer.
- **Step 3**: Apply **K-Means Clustering** to segment the customers into distinct groups.
- **Step 4**: Visualize the customer segments with a scatter plot.
"""


def build_customer_features(transactions: pd.DataFrame) -> pd.DataFrame:
    """Aggregate raw transaction rows into one feature row per customer.

    Rows without a ``CustomerID`` and rows whose ``Quantity`` or
    ``UnitPrice`` cannot be parsed as numbers are discarded.

    Args:
        transactions: Raw rows containing at least the columns listed in
            ``REQUIRED_COLUMNS``.

    Returns:
        A frame indexed by ``CustomerID`` with the columns ``TotalSpent``
        (sum of Quantity * UnitPrice), ``NumTransactions`` (sum of Quantity)
        and ``AvgUnitPrice`` (mean of UnitPrice).

    Raises:
        ValueError: If a required column is missing.
    """
    missing = [col for col in REQUIRED_COLUMNS if col not in transactions.columns]
    if missing:
        raise ValueError(
            f"Dataset is missing required column(s): {', '.join(missing)}"
        )

    # Coerce once and reuse the coerced values for both the product and the
    # aggregation; `.assign` returns a new frame, so no chained-assignment
    # warning is triggered on the filtered data.
    cleaned = transactions.dropna(subset=["CustomerID"]).assign(
        Quantity=lambda rows: pd.to_numeric(rows["Quantity"], errors="coerce"),
        UnitPrice=lambda rows: pd.to_numeric(rows["UnitPrice"], errors="coerce"),
    )
    cleaned = cleaned.assign(TotalSpent=cleaned["Quantity"] * cleaned["UnitPrice"])
    cleaned = cleaned.dropna(subset=["TotalSpent"])

    # NOTE(review): "NumTransactions" is the summed Quantity, i.e. the number
    # of items bought, not a count of transactions. The name is kept because
    # it is shown to the user; confirm which meaning was intended.
    return (
        cleaned.groupby("CustomerID")
        .agg({"TotalSpent": "sum", "Quantity": "sum", "UnitPrice": "mean"})
        .rename(columns={"Quantity": "NumTransactions", "UnitPrice": "AvgUnitPrice"})
    )


@st.cache_data
def prepare_data(file_path: str, sheet_name: str) -> tuple:
    """Load the workbook and derive everything the UI needs from it.

    Cached because Streamlit re-runs the whole script on every widget
    interaction; without the cache the Excel file would be re-parsed each
    time the slider moves. The cache is keyed on the arguments only, so it
    must be cleared if the file changes on disk.

    Args:
        file_path: Path of the Excel workbook.
        sheet_name: Worksheet to read.

    Returns:
        ``(preview, customer_data, customer_scaled)``: the first rows of the
        raw data, the per-customer features, and the standardized features
        (same columns and index as ``customer_data``).

    Raises:
        FileNotFoundError: If the workbook does not exist.
        ValueError: If the sheet or a required column is missing, or fewer
            than ``MAX_CLUSTERS`` customers remain after cleaning.
    """
    transactions = pd.read_excel(file_path, sheet_name=sheet_name)
    customer_data = build_customer_features(transactions)
    if len(customer_data) < MAX_CLUSTERS:
        raise ValueError(
            f"Only {len(customer_data)} customer(s) remain after cleaning; "
            f"at least {MAX_CLUSTERS} are needed for clustering."
        )

    customer_scaled = pd.DataFrame(
        StandardScaler().fit_transform(customer_data),
        columns=customer_data.columns,
        index=customer_data.index,
    )
    # Only the preview of the raw data is returned so the cache does not have
    # to copy the full transaction table on every rerun.
    return transactions.head(), customer_data, customer_scaled


@st.cache_data
def compute_elbow_inertias(customer_scaled: pd.DataFrame, max_clusters: int) -> list:
    """Return the K-Means inertia for every k in ``1..max_clusters``."""
    return [
        KMeans(n_clusters=k, n_init=N_INIT, random_state=RANDOM_STATE)
        .fit(customer_scaled)
        .inertia_
        for k in range(1, max_clusters + 1)
    ]


@st.cache_data
def fit_cluster_labels(customer_scaled: pd.DataFrame, num_clusters: int):
    """Return the K-Means cluster label of each row of ``customer_scaled``."""
    model = KMeans(n_clusters=num_clusters, n_init=N_INIT, random_state=RANDOM_STATE)
    return model.fit_predict(customer_scaled)


# App title
st.title("🛍️ Customer Segmentation Tool")

# 🎯 Streamlit Tabs
tab1, tab2, tab3 = st.tabs(
    ["📖 About", "📊 Dataset Overview", "🧑‍🤝‍🧑 Customer Segmentation"]
)

# About Tab
with tab1:
    st.write(ABOUT_TEXT)

# Load and preprocess the bundled dataset; show a readable message instead of
# a traceback when the data cannot be used.
try:
    preview, customer_data, customer_scaled = prepare_data(DATA_FILE, SHEET_NAME)
except (FileNotFoundError, ValueError) as exc:
    st.error(f"Could not prepare the dataset '{DATA_FILE}': {exc}")
    st.stop()

# Dataset Overview Tab
with tab2:
    st.write("### Dataset Overview")
    st.write(preview)

    st.write("### Processed Customer Data")
    st.write(customer_data.head())

    # Elbow Method to determine optimal clusters
    st.write("### Elbow Method for Optimal Cluster Selection")
    cluster_counts = range(1, MAX_CLUSTERS + 1)
    distortions = compute_elbow_inertias(customer_scaled, MAX_CLUSTERS)

    elbow_fig, elbow_ax = plt.subplots()
    elbow_ax.plot(cluster_counts, distortions, marker="o")
    elbow_ax.set_xlabel("Number of Clusters")
    elbow_ax.set_ylabel("Distortion")
    elbow_ax.set_title("Elbow Method for Optimal k")
    st.pyplot(elbow_fig)
    # Figures are recreated on every rerun; close them so they do not pile up.
    plt.close(elbow_fig)

# Customer Segmentation Tab
with tab3:
    # User selects the number of clusters
    num_clusters = st.slider(
        "Select Number of Clusters", min_value=2, max_value=MAX_CLUSTERS, value=3
    )

    # Apply K-Means clustering
    segmented = customer_data.assign(
        Cluster=fit_cluster_labels(customer_scaled, num_clusters)
    )

    # Visualize the clusters
    st.write("### Clusters Visualization")
    seg_fig, seg_ax = plt.subplots()
    scatter = seg_ax.scatter(
        segmented["TotalSpent"],
        segmented["NumTransactions"],
        c=segmented["Cluster"],
        cmap="viridis",
    )
    seg_ax.set_xlabel("Total Spent")
    seg_ax.set_ylabel("Number of Transactions")
    seg_ax.set_title("Customer Segments")
    # Attach the colorbar to this figure explicitly rather than relying on
    # pyplot's global "current figure" state.
    seg_fig.colorbar(scatter, ax=seg_ax, label="Cluster")
    st.pyplot(seg_fig)
    plt.close(seg_fig)

    # Show the segmented customer data
    st.write("### Customer Segments Data")
    st.write(
        segmented.groupby("Cluster").agg(
            {"TotalSpent": "mean", "NumTransactions": "mean", "AvgUnitPrice": "mean"}
        )
    )