"""Streamlit app: cluster student behavior data with KMeans / DBSCAN / Hierarchical,
report evaluation metrics, and visualize/interpret the resulting clusters."""

import streamlit as st
from streamlit_option_menu import option_menu
import pandas as pd
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import plotly.express as px

st.set_page_config(layout="wide")
st.title("Student Behavior Clustering 📊✨")
st.write("This app performs clustering on student behavior data to identify patterns and segments of students.")

# Two option menus: App, About
tabs = ["App", "About"]
app_mode = option_menu(None, options=tabs, icons=["📊", "❓"], default_index=0, orientation="horizontal")

# --- Sidebar for Settings and File Upload ---
st.sidebar.header("Data and Clustering Settings")

# File Upload
uploaded_file = st.sidebar.file_uploader(
    "Choose a CSV file or use default:", type=["csv"]
)

# Use a default dataset if no file is uploaded
if uploaded_file is None:
    df = pd.read_csv("clustering_data.csv")
else:
    df = pd.read_csv(uploaded_file)

# --- Data Preprocessing (Example: Handling Missing Values) ---
# FIX: df.mean() without numeric_only raises TypeError on frames with
# non-numeric columns in pandas >= 2.0; impute with numeric column means only.
df.fillna(df.mean(numeric_only=True), inplace=True)

# --- Feature Engineering (Example) ---
# Composite engagement score: equal-weight blend of attendance and test average.
df['engagement_score'] = (
    df['attendance_rate'] * 0.5 + df['test_average'] * 0.5
)

# Select features for clustering
features = df[['attendance_rate', 'test_average', 'engagement_score']]

# Standard Scaling — distance-based algorithms need comparable feature scales.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Sidebar for Algorithm Selection and Parameter Tuning
st.sidebar.header("Clustering Settings")
algorithm = st.sidebar.selectbox(
    "Select Algorithm:", ("KMeans", "DBSCAN", "Hierarchical")
)

# Default values for parameters (used when the expander widgets are not shown
# for the currently selected algorithm).
n_clusters_kmeans = 3
eps = 0.5
min_samples = 5
n_clusters_hierarchical = 3
linkage = 'ward'

# Parameter tuning section — only the selected algorithm's widgets are rendered.
with st.sidebar.expander("Algorithm Parameters"):
    if algorithm == "KMeans":
        n_clusters_kmeans = st.slider(
            "Number of Clusters (K)", 2, 10, 3,
            help="Number of clusters to form for KMeans."
        )
    elif algorithm == "DBSCAN":
        eps = st.slider(
            "Epsilon (eps)", 0.1, 2.0, 0.5, 0.1,
            help="Maximum distance between two samples for one to be considered as in the neighborhood of the other for DBSCAN."
        )
        min_samples = st.slider(
            "Min Samples", 2, 10, 5,
            help="The number of samples in a neighborhood for a point to be considered as a core point for DBSCAN."
        )
    else:  # Hierarchical
        n_clusters_hierarchical = st.slider(
            "Number of Clusters", 2, 10, 3,
            help="Number of clusters to find for hierarchical clustering."
        )
        linkage = st.selectbox(
            "Linkage", ['ward', 'complete', 'average', 'single'],
            help="Which linkage criterion to use for hierarchical clustering."
        )


def cluster_data(algo_name, **kwargs):
    """Fit the chosen clustering algorithm on the module-level `scaled_features`.

    Args:
        algo_name: One of "KMeans", "DBSCAN", or anything else (treated as
            hierarchical / AgglomerativeClustering).
        **kwargs: Algorithm hyperparameters — `n_clusters`, `eps`,
            `min_samples`, `linkage` — with sensible defaults applied.

    Returns:
        Array of cluster labels (DBSCAN may label noise points -1), or
        None if fitting raised; the error is surfaced via st.error.
    """
    try:
        if algo_name == "KMeans":
            # FIX: pin n_init explicitly — the sklearn default changed across
            # versions (10 -> 'auto') and omitting it triggers a FutureWarning.
            model = KMeans(
                n_clusters=kwargs.get('n_clusters', 3),
                random_state=42,
                n_init=10,
            )
        elif algo_name == "DBSCAN":
            model = DBSCAN(
                eps=kwargs.get('eps', 0.5),
                min_samples=kwargs.get('min_samples', 5),
            )
        else:  # Hierarchical
            model = AgglomerativeClustering(
                n_clusters=kwargs.get('n_clusters', 3),
                linkage=kwargs.get('linkage', 'ward'),
            )
        clusters = model.fit_predict(scaled_features)
        return clusters
    except Exception as e:
        st.error(f"An error occurred during clustering: {e}")
        return None


# Perform clustering — pass only the parameters relevant to the selected
# algorithm; the rest fall back to the defaults above.
clusters = cluster_data(
    algorithm,
    n_clusters=n_clusters_kmeans if algorithm == "KMeans" else n_clusters_hierarchical,
    eps=eps if algorithm == "DBSCAN" else 0.5,
    min_samples=min_samples if algorithm == "DBSCAN" else 5,
    linkage=linkage if algorithm == "Hierarchical" else "ward",
)

# THE APP CONTENT
if app_mode == "About":
    st.write(
        """
        ## About
        This app performs clustering on student behavior data to identify patterns and segments of students.

        ### Data
        The dataset contains student information such as attendance rate, test average, and engagement score.

        ### Clustering Algorithms
        - **KMeans:** Partitions data into K clusters based on feature similarity.
        - **DBSCAN:** Density-based clustering to identify outliers and clusters of varying shapes.
        - **Hierarchical:** Builds a tree of clusters to identify subgroups.

        ### Evaluation Metrics
        - **Silhouette Score:** Measures how similar an object is to its cluster compared to other clusters.
        - **Davies-Bouldin Index:** Computes the average similarity between each cluster and its most similar one.
        - **Calinski-Harabasz Index:** Ratio of the sum of between-clusters dispersion and within-cluster dispersion.

        ### Cluster Profiling
        - Parallel coordinates plot to visualize and compare clusters across multiple features.

        ### Interpretation of Clusters
        - Provides insights into each cluster based on the average values of features.
        """
    )
    st.write(
        """
        ## How to Use
        1. **Upload Data:** Upload your own CSV file or use the default dataset.
        2. **Select Algorithm:** Choose between KMeans, DBSCAN, and Hierarchical clustering.
        3. **Set Parameters:** Adjust the clustering parameters in the sidebar.
        4. **Interpret Results:** Explore the clustered data, evaluation metrics, and cluster profiles.
        """
    )
    st.write(
        """
        ## Contact
        If you have any questions or feedback, feel free to connect with me on:
        - [LinkedIn](https://www.linkedin.com/in/abdellatif-laghjaj)
        - [GitHub](https://www.github.com/abdellatif-laghjaj)
        """
    )
elif app_mode == "App":
    if clusters is not None:
        df['cluster'] = clusters

        # --- Display Clustered Data ---
        st.subheader(f"Clustered Data using {algorithm}:")
        st.dataframe(df)

        # --- Evaluation Metrics ---
        # All three metrics require at least two distinct labels; DBSCAN can
        # produce a single label (all noise), so guard against that.
        if len(set(clusters)) > 1:
            silhouette_avg = silhouette_score(scaled_features, clusters)
            db_index = davies_bouldin_score(scaled_features, clusters)
            ch_index = calinski_harabasz_score(scaled_features, clusters)
            st.subheader("Clustering Evaluation Metrics")
            st.markdown(f"**Silhouette Score:** {silhouette_avg:.2f}", unsafe_allow_html=True)
            st.markdown(f"**Davies-Bouldin Index:** {db_index:.2f}", unsafe_allow_html=True)
            st.markdown(f"**Calinski-Harabasz Index:** {ch_index:.2f}", unsafe_allow_html=True)
        else:
            st.warning("Evaluation metrics are not applicable. Only one cluster found.")

        # --- Interactive 3D Scatter Plot with Plotly ---
        st.subheader("Interactive 3D Cluster Visualization")
        fig = px.scatter_3d(
            df,
            x='attendance_rate',
            y='test_average',
            z='engagement_score',
            color='cluster',
            title=f"Student Clusters ({algorithm})",
            labels={'attendance_rate': 'Attendance Rate',
                    'test_average': 'Test Average',
                    'engagement_score': 'Engagement Score'}
        )
        st.plotly_chart(fig)

        # --- Cluster Profiling (Example using Plotly) ---
        st.subheader("Cluster Profile Visualization")
        st.write("The parallel coordinates plot is a way to visualize and compare clusters across multiple features.")
        profile_features = ['attendance_rate', 'test_average', 'engagement_score']
        cluster_means = df.groupby('cluster')[profile_features].mean().reset_index()
        fig_profile = px.parallel_coordinates(
            cluster_means,
            color='cluster',
            dimensions=profile_features,
            title="Parallel Coordinates Plot for Cluster Profiles"
        )
        st.plotly_chart(fig_profile)

        # --- Dynamic Interpretation of Clusters ---
        st.subheader("Interpretation of Clusters")
        for cluster_num in cluster_means['cluster']:
            # FIX: the original named this local `cluster_data`, shadowing the
            # cluster_data() function defined above — renamed to cluster_row.
            cluster_row = cluster_means[cluster_means['cluster'] == cluster_num]
            st.write(f"**Cluster {cluster_num}:**")
            for feature in profile_features:
                st.write(f"- **{feature.replace('_', ' ').title()}:** {cluster_row[feature].values[0]:.2f}")
            highest_feature = cluster_row[profile_features].idxmax(axis=1).values[0]
            lowest_feature = cluster_row[profile_features].idxmin(axis=1).values[0]
            st.write(f"This cluster has the highest average {highest_feature.replace('_', ' ')} "
                     f"and the lowest average {lowest_feature.replace('_', ' ')}.")
            st.write("---")
            # Additional insights based on cluster characteristics can be added here.
    else:
        st.warning("Please configure the clustering settings and run the algorithm first.")