# Indus_err / app.py
# Spencer525's picture
# Update app.py
# 7f38fb4 verified
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
import joblib
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Keep only the features whose variance inflation factor (VIF) is below 10
def calculate_vif(df):
    """Return the column names of *df* whose VIF is strictly below 10.

    The VIF of every column is computed with statsmodels'
    ``variance_inflation_factor`` against the full value matrix of *df*.
    Columns whose score is not below 10 are left out (a NaN score never
    compares below 10), and the surviving names keep their column order.
    """
    design = df.values
    vif_scores = [
        variance_inflation_factor(design, position)
        for position in range(design.shape[1])
    ]
    return [
        column
        for column, score in zip(df.columns, vif_scores)
        if score < 10
    ]
# Function to load and process data (including VIF and PCA)
def process_data(file, scaler_option):
    """Load a CSV, VIF-filter its numeric columns, scale them and project to 2-D.

    Args:
        file: Anything ``pd.read_csv`` accepts (a path or a file-like object).
        scaler_option: ``'StandardScaler'`` or ``'MinMaxScaler'``.

    Returns:
        ``(pca_data, selected_features)``: the two-component PCA projection of
        the scaled, VIF-filtered columns and the list of column names that
        were kept. ``(None, None)`` is returned, after the problem has been
        reported through ``st.error``, when no usable projection can be built.

    Raises:
        ValueError: If ``scaler_option`` is not one of the supported names.
    """
    scalers = {
        'StandardScaler': StandardScaler,
        'MinMaxScaler': MinMaxScaler,
    }
    if scaler_option not in scalers:
        # Fail with a clear message instead of tripping over an unbound
        # `scaler` variable further down.
        raise ValueError(f"Unsupported scaler option: {scaler_option!r}")

    df = pd.read_csv(file)

    # Select only numeric columns for VIF calculation
    df_numeric = df.select_dtypes(include=[np.number])

    # A column without a single value has no mean to impute with; it would
    # stay NaN after fillna and break the VIF computation, so drop it first.
    df_numeric = df_numeric.dropna(axis=1, how='all')

    # Handle missing values by filling them with the mean of the respective columns
    df_numeric = df_numeric.fillna(df_numeric.mean())

    # Calculate VIF and filter features with VIF < 10
    selected_features = calculate_vif(df_numeric)
    if not selected_features:
        st.error("No features with VIF < 10 found. Please review the data.")
        return None, None

    df_filtered = df_numeric[selected_features]

    # PCA(n_components=2) requires n_components <= min(n_samples, n_features);
    # report the problem instead of letting scikit-learn raise.
    n_rows, n_features = df_filtered.shape
    if n_rows < 2 or n_features < 2:
        st.error(
            "At least two rows and two features with VIF < 10 are required "
            "for the 2-component PCA. Please review the data."
        )
        return None, None

    # Apply chosen scaler
    scaled_data = scalers[scaler_option]().fit_transform(df_filtered)

    # PCA Transformation (2 components for visualization)
    pca = PCA(n_components=2)
    pca_data = pca.fit_transform(scaled_data)

    return pca_data, selected_features
# Helper shared by the three clustering panels below
def _render_clustering(axis, model_file, params, points, title, missing_title):
    """Fit one uploaded clustering model on *points* and draw it on *axis*.

    Args:
        axis: Matplotlib axes that receives the scatter plot and the title.
        model_file: Uploaded model file, or ``None`` when nothing was uploaded.
        params: Keyword arguments forwarded to the estimator's ``set_params``.
        points: 2-D array; columns 0 and 1 are used as x and y coordinates.
        title: Axes title used when the model was fitted and plotted.
        missing_title: Axes title used when ``model_file`` is ``None``.
    """
    if model_file is None:
        axis.set_title(missing_title)
        return

    # SECURITY NOTE: joblib.load unpickles the uploaded file, and unpickling
    # can execute arbitrary code. Only load model files from trusted sources.
    model = joblib.load(model_file)
    model.set_params(**params)
    labels = model.fit_predict(points)
    axis.scatter(points[:, 0], points[:, 1], c=labels, cmap='viridis')
    axis.set_title(title)


# Set up the Streamlit page
st.title("Clustering Analysis with K-means, Hierarchical, and DBSCAN Models")

# Upload the detectors report CSV file
data_file = st.file_uploader("Upload the detectors report file (.csv)", type="csv")

# Upload the models
kmeans_model = st.file_uploader("Upload the K-means model (.sav)", type="sav")
hierarchical_model = st.file_uploader("Upload the Hierarchical Clustering model (.sav)", type="sav")
dbscan_model = st.file_uploader("Upload the DBSCAN model (.sav)", type="sav")

# Parameter selection for K-means, Hierarchical Clustering, and DBSCAN
if data_file is not None:
    st.sidebar.header("Adjust Clustering Parameters")

    # Scaler selection
    scaler_option = st.sidebar.selectbox("Choose Scaler", ("StandardScaler", "MinMaxScaler"))

    # K-means parameters
    kmeans_clusters = st.sidebar.slider("K-means: Number of Clusters", min_value=2, max_value=10, value=3)

    # Hierarchical Clustering parameters
    hierarchical_clusters = st.sidebar.slider("Hierarchical: Number of Clusters", min_value=2, max_value=10, value=3)
    linkage = st.sidebar.selectbox("Hierarchical: Linkage Method", ["ward", "complete", "average", "single"])

    # DBSCAN parameters
    dbscan_eps = st.sidebar.number_input("DBSCAN: Epsilon", min_value=0.1, max_value=10.0, value=0.5, step=0.1)
    dbscan_min_samples = st.sidebar.slider("DBSCAN: Minimum Samples", min_value=1, max_value=20, value=5)

    # Load and process the data
    pca_data, selected_features = process_data(data_file, scaler_option)

    if pca_data is not None:
        st.write(f"Selected features after VIF filtering: {selected_features}")

        # Prepare the plot
        fig, ax = plt.subplots(1, 3, figsize=(15, 5))
        try:
            ax = ax.flatten()

            # K-means Clustering
            _render_clustering(
                ax[0],
                kmeans_model,
                # n_init='auto' is set for newer versions of sklearn
                {'n_clusters': kmeans_clusters, 'n_init': 'auto'},
                pca_data,
                f"K-means Clustering (n_clusters={kmeans_clusters})",
                "K-means Model Missing",
            )

            # Hierarchical Clustering
            _render_clustering(
                ax[1],
                hierarchical_model,
                {'n_clusters': hierarchical_clusters, 'linkage': linkage},
                pca_data,
                f"Hierarchical Clustering (n_clusters={hierarchical_clusters}, linkage={linkage})",
                "Hierarchical Model Missing",
            )

            # DBSCAN Clustering
            _render_clustering(
                ax[2],
                dbscan_model,
                {'eps': dbscan_eps, 'min_samples': dbscan_min_samples},
                pca_data,
                f"DBSCAN Clustering (eps={dbscan_eps}, min_samples={dbscan_min_samples})",
                "DBSCAN Model Missing",
            )

            # Display the plots
            st.pyplot(fig)
        finally:
            # Streamlit re-runs this script on every interaction; without an
            # explicit close each run would leave another figure registered
            # with pyplot and memory would keep growing.
            plt.close(fig)
    else:
        st.warning("Data processing failed due to VIF constraints.")
else:
    st.info("Please upload the detectors report file to proceed.")