Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import StandardScaler, MinMaxScaler | |
| from sklearn.decomposition import PCA | |
| from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN | |
| from sklearn.metrics import silhouette_score | |
| import joblib | |
| import matplotlib.pyplot as plt | |
| from statsmodels.stats.outliers_influence import variance_inflation_factor | |
def calculate_vif(df, threshold=10):
    """Return the columns of ``df`` whose variance inflation factor is low.

    Args:
        df: DataFrame whose ``.values`` array is handed to statsmodels'
            ``variance_inflation_factor`` one column index at a time.
        threshold: Exclusive upper bound a column's VIF must stay under to be
            kept. Defaults to 10, the cut-off previously hard-coded here.

    Returns:
        A list of column labels, in their original order, whose VIF is
        strictly below ``threshold``. A NaN or infinite VIF never satisfies
        the comparison, so such columns are left out. An empty frame yields
        an empty list.
    """
    # Build the array once; ``df.values`` may copy on every access.
    values = df.values
    return [
        column
        for position, column in enumerate(df.columns)
        if variance_inflation_factor(values, position) < threshold
    ]
def process_data(file, scaler_option):
    """Load a CSV, keep low-VIF numeric features, scale them and run 2-D PCA.

    Args:
        file: Path or file-like object passed straight to ``pd.read_csv``.
        scaler_option: Either ``'StandardScaler'`` or ``'MinMaxScaler'``.

    Returns:
        ``(pca_data, selected_features)``: the output of a 2-component PCA
        fitted on the scaled data, and the list of feature names returned by
        ``calculate_vif``. Returns ``(None, None)`` after reporting the
        problem with ``st.error`` when no feature passes the VIF filter or
        when fewer than two rows/features remain for the PCA.

    Raises:
        ValueError: If ``scaler_option`` is not one of the supported names.
    """
    # Fail fast: previously an unknown option surfaced as an
    # UnboundLocalError on ``scaler`` only after the file had been read and
    # the VIF computed.
    if scaler_option not in ('StandardScaler', 'MinMaxScaler'):
        raise ValueError(
            f"Unsupported scaler option: {scaler_option!r}. "
            "Expected 'StandardScaler' or 'MinMaxScaler'."
        )

    df = pd.read_csv(file)

    # Select only numeric columns for VIF calculation
    df_numeric = df.select_dtypes(include=[np.number])

    # A column that is entirely NaN has a NaN mean, so the mean-fill below
    # would leave it untouched and feed NaNs into the VIF regression.
    df_numeric = df_numeric.dropna(axis=1, how='all')

    # Handle missing values by filling them with the mean of each column
    df_numeric = df_numeric.fillna(df_numeric.mean())

    # Calculate VIF and filter features with VIF < 10
    selected_features = calculate_vif(df_numeric)
    if not selected_features:
        st.error("No features with VIF < 10 found. Please review the data.")
        return None, None
    df_filtered = df_numeric[selected_features]

    # PCA(n_components=2) needs at least two rows and two columns.
    if min(df_filtered.shape) < 2:
        st.error(
            "At least two rows and two features with VIF < 10 are required "
            "for the 2-component PCA. Please review the data."
        )
        return None, None

    # Apply chosen scaler (the option was validated above)
    if scaler_option == 'StandardScaler':
        scaler = StandardScaler()
    else:
        scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(df_filtered)

    # PCA Transformation (2 components for visualization)
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)
    return pca_data, selected_features
def _fit_and_plot(axis, uploaded_model, params, data, title, missing_title):
    """Refit one uploaded estimator on ``data`` and scatter-plot its labels.

    When ``uploaded_model`` is None only ``missing_title`` is set on the axis.
    Otherwise the estimator is loaded, reconfigured with ``params``, fitted
    with ``fit_predict`` and the first two columns of ``data`` are plotted,
    coloured by the resulting labels.
    """
    if uploaded_model is None:
        axis.set_title(missing_title)
        return
    # SECURITY: joblib.load unpickles the upload, which can execute arbitrary
    # code. Only use this app with model files from a trusted source.
    model = joblib.load(uploaded_model)
    model.set_params(**params)
    labels = model.fit_predict(data)
    axis.scatter(data[:, 0], data[:, 1], c=labels, cmap='viridis')
    axis.set_title(title)


# Set up the Streamlit page
st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")

# Upload the detectors report CSV file
data_file = st.file_uploader("Upload the detectors report file (.csv)", type="csv")

# Upload the models
kmeans_model = st.file_uploader("Upload the K-means model (.sav)", type="sav")
hierarchical_model = st.file_uploader("Upload the Hierarchical Clustering model (.sav)", type="sav")
dbscan_model = st.file_uploader("Upload the DBSCAN model (.sav)", type="sav")

# Parameter selection for K-means, Hierarchical Clustering, and DBSCAN
if data_file is not None:
    st.sidebar.header("Adjust Clustering Parameters")

    # Scaler selection
    scaler_option = st.sidebar.selectbox("Choose Scaler", ("StandardScaler", "MinMaxScaler"))

    # K-means parameters
    kmeans_clusters = st.sidebar.slider("K-means: Number of Clusters", min_value=2, max_value=10, value=3)

    # Hierarchical Clustering parameters
    hierarchical_clusters = st.sidebar.slider("Hierarchical: Number of Clusters", min_value=2, max_value=10, value=3)
    linkage = st.sidebar.selectbox("Hierarchical: Linkage Method", ["ward", "complete", "average", "single"])

    # DBSCAN parameters
    dbscan_eps = st.sidebar.number_input("DBSCAN: Epsilon", min_value=0.1, max_value=10.0, value=0.5, step=0.1)
    dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)

    # Load and process the data
    pca_data, selected_features = process_data(data_file, scaler_option)

    if pca_data is not None:
        st.write(f"Selected features after VIF filtering: {selected_features}")

        # One row per subplot: (upload, estimator params, title, fallback title)
        panels = [
            (
                kmeans_model,
                # n_init='auto' is set for newer versions of sklearn
                {'n_clusters': kmeans_clusters, 'n_init': 'auto'},
                f"K-means Clustering (n_clusters={kmeans_clusters})",
                "K-means Model Missing",
            ),
            (
                hierarchical_model,
                {'n_clusters': hierarchical_clusters, 'linkage': linkage},
                f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})",
                "Hierarchical Model Missing",
            ),
            (
                dbscan_model,
                {'eps': dbscan_eps, 'min_samples': dbscan_min_samples},
                f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})",
                "DBSCAN Model Missing",
            ),
        ]

        # Prepare the plot; a 1x3 grid already comes back as a flat array.
        fig, axes = plt.subplots(1, 3, figsize=(15, 5))
        try:
            for axis, (uploaded, params, title, missing_title) in zip(axes, panels):
                _fit_and_plot(axis, uploaded, params, pca_data, title, missing_title)

            # Display the plots
            st.pyplot(fig)
        finally:
            # Streamlit reruns this script on every interaction; without an
            # explicit close each run leaves a figure in pyplot's registry.
            plt.close(fig)
    else:
        st.warning("Data processing failed due to VIF constraints.")
else:
    st.info("Please upload the detectors report file to proceed.")