Spaces:
Running
Running
import streamlit as st | |
import pandas as pd | |
import numpy as np | |
from sklearn.preprocessing import StandardScaler, MinMaxScaler | |
from sklearn.decomposition import PCA | |
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN | |
from sklearn.metrics import silhouette_score | |
import joblib | |
import matplotlib.pyplot as plt | |
from statsmodels.stats.outliers_influence import variance_inflation_factor | |
# Function to calculate VIF and filter features with VIF < 10
def calculate_vif(df, threshold=10):
    """Return the names of the columns of *df* whose VIF is below *threshold*.

    The variance inflation factor (VIF) of every column is computed with
    ``statsmodels``' ``variance_inflation_factor`` against all the other
    columns of *df*.

    Args:
        df: DataFrame whose columns are the candidate features. The values
            are passed to statsmodels as-is, so they are expected to be
            numeric and free of missing values.
        threshold: Exclusive upper bound on the VIF of a retained feature.
            Defaults to 10, the cutoff this function previously hard-coded.

    Returns:
        A list of column names, in their original order, whose VIF is
        strictly below *threshold*. Columns whose VIF evaluates to NaN or
        infinity are excluded because the ``<`` comparison is false for them.
    """
    # Materialise the underlying array once instead of once per column.
    values = df.values

    # NOTE(review): no intercept column is added before computing the VIFs;
    # the statsmodels documentation describes the input as a design matrix
    # that normally includes a constant -- confirm this is intended.
    vif_data = pd.DataFrame()
    vif_data['feature'] = df.columns
    vif_data['VIF'] = [
        variance_inflation_factor(values, i) for i in range(values.shape[1])
    ]
    return vif_data[vif_data['VIF'] < threshold]['feature'].tolist()
# Function to load and process data (including VIF and PCA)
def process_data(file, scaler_option):
    """Load a CSV file and reduce its numeric features to two PCA components.

    Pipeline: read the CSV, keep the numeric columns, impute missing values
    with the column mean, keep the features whose VIF is below 10, scale
    them with the chosen scaler, and project them onto two principal
    components.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        scaler_option: Either ``'StandardScaler'`` or ``'MinMaxScaler'``.

    Returns:
        A tuple ``(pca_data, selected_features)`` where ``pca_data`` is the
        2-component PCA projection and ``selected_features`` is the list of
        column names that passed the VIF filter. Returns ``(None, None)``
        after reporting the problem with ``st.error`` when the data cannot be
        processed.

    Raises:
        ValueError: If *scaler_option* is not one of the supported names.
    """
    # Resolve the scaler up front: previously an unknown option left
    # ``scaler`` unbound and failed later with an UnboundLocalError.
    if scaler_option == 'StandardScaler':
        scaler = StandardScaler()
    elif scaler_option == 'MinMaxScaler':
        scaler = MinMaxScaler()
    else:
        raise ValueError(
            f"Unsupported scaler option: {scaler_option!r} "
            "(expected 'StandardScaler' or 'MinMaxScaler')"
        )

    df = pd.read_csv(file)

    # Select only numeric columns for VIF calculation
    df_numeric = df.select_dtypes(include=[np.number])

    # A column with no values at all has a NaN mean, so mean imputation
    # would leave it full of NaN; drop such columns before imputing.
    df_numeric = df_numeric.dropna(axis=1, how='all')

    # Handle missing values by filling them with the mean of the respective columns
    df_numeric = df_numeric.fillna(df_numeric.mean())

    # Calculate VIF and filter features with VIF < 10
    selected_features = calculate_vif(df_numeric)
    if not selected_features:
        st.error("No features with VIF < 10 found. Please review the data.")
        return None, None

    df_filtered = df_numeric[selected_features]

    # PCA below asks for two components, which needs at least two rows and
    # two features; report that instead of letting PCA raise.
    if min(df_filtered.shape) < 2:
        st.error(
            "At least two rows and two features with VIF < 10 are required "
            "for the 2-component PCA. Please review the data."
        )
        return None, None

    # Apply chosen scaler
    scaled_data = scaler.fit_transform(df_filtered)

    # PCA Transformation (2 components for visualization)
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)
    return pca_data, selected_features
# Set up the Streamlit page
st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")

# Upload the detectors report CSV file
data_file = st.file_uploader("Upload the detectors report file (.csv)", type="csv")

# Upload the models
kmeans_model = st.file_uploader("Upload the K-means model (.sav)", type="sav")
hierarchical_model = st.file_uploader("Upload the Hierarchical Clustering model (.sav)", type="sav")
dbscan_model = st.file_uploader("Upload the DBSCAN model (.sav)", type="sav")


def _plot_uploaded_model(axis, points, model_file, params, title, missing_title):
    """Fit an uploaded clustering model on *points* and draw it on *axis*.

    Args:
        axis: Matplotlib axes to draw on.
        points: 2-column array of PCA coordinates to cluster and plot.
        model_file: Uploaded model file, or ``None`` when nothing was uploaded.
        params: Keyword arguments forwarded to the estimator's ``set_params``.
        title: Axes title used when the model is available.
        missing_title: Axes title used when *model_file* is ``None``.
    """
    if model_file is None:
        axis.set_title(missing_title)
        return

    # SECURITY: joblib.load unpickles the uploaded file, which can execute
    # arbitrary code. Only upload model files from a trusted source.
    model = joblib.load(model_file)
    model.set_params(**params)
    labels = model.fit_predict(points)
    axis.scatter(points[:, 0], points[:, 1], c=labels, cmap='viridis')
    axis.set_title(title)


# Parameter selection for K-means, Hierarchical Clustering, and DBSCAN
if data_file is not None:
    st.sidebar.header("Adjust Clustering Parameters")

    # Scaler selection
    scaler_option = st.sidebar.selectbox("Choose Scaler", ("StandardScaler", "MinMaxScaler"))

    # K-means parameters
    kmeans_clusters = st.sidebar.slider("K-means: Number of Clusters", min_value=2, max_value=10, value=3)

    # Hierarchical Clustering parameters
    hierarchical_clusters = st.sidebar.slider("Hierarchical: Number of Clusters", min_value=2, max_value=10, value=3)
    linkage = st.sidebar.selectbox("Hierarchical: Linkage Method", ["ward", "complete", "average", "single"])

    # DBSCAN parameters
    dbscan_eps = st.sidebar.number_input("DBSCAN: Epsilon", min_value=0.1, max_value=10.0, value=0.5, step=0.1)
    dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)

    # Load and process the data
    pca_data, selected_features = process_data(data_file, scaler_option)

    if pca_data is not None:
        st.write(f"Selected features after VIF filtering: {selected_features}")

        # Prepare the plot
        fig, ax = plt.subplots(1, 3, figsize=(15, 5))
        ax = ax.flatten()

        # K-means Clustering
        _plot_uploaded_model(
            ax[0],
            pca_data,
            kmeans_model,
            # Set n_init='auto' for newer versions of sklearn
            {'n_clusters': kmeans_clusters, 'n_init': 'auto'},
            f"K-means Clustering (n_clusters={kmeans_clusters})",
            "K-means Model Missing",
        )

        # Hierarchical Clustering
        _plot_uploaded_model(
            ax[1],
            pca_data,
            hierarchical_model,
            {'n_clusters': hierarchical_clusters, 'linkage': linkage},
            f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})",
            "Hierarchical Model Missing",
        )

        # DBSCAN Clustering
        _plot_uploaded_model(
            ax[2],
            pca_data,
            dbscan_model,
            {'eps': dbscan_eps, 'min_samples': dbscan_min_samples},
            f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})",
            "DBSCAN Model Missing",
        )

        # Display the plots
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; without this
        # each Streamlit rerun would leave another figure in memory.
        plt.close(fig)
    else:
        st.warning("Data processing failed due to VIF constraints.")
else:
    st.info("Please upload the detectors report file to proceed.")