"""Interactive Streamlit demo: UMAP dimensionality reduction + HDBSCAN on MNIST.

The script loads a random subset of MNIST, shows a few sample digits, and,
once the sidebar form is submitted, renders a 2D UMAP embedding coloured by
the true digit and by the HDBSCAN cluster assignment.
"""

import streamlit as st
import umap
import hdbscan
import plotly.express as px
from sklearn.datasets import fetch_openml
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Fraction of MNIST that is sampled, and the seed that makes the sample
# reproducible. The fraction is also shown in the app description.
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 42

N_NEIGHBORS_HELP = (
    "Number of Neighbors (n_neighbors): This parameter controls how UMAP "
    "balances local versus global structure in the data. It determines the "
    "number of neighboring points used in the local manifold approximations. "
    "A higher value considers more neighbors for a broader view of the data "
    "structure, while a lower value focuses on the local neighborhood, "
    "emphasizing finer details."
)

MIN_DIST_HELP = (
    "Minimum Distance (min_dist): This parameter controls how tightly UMAP is "
    "allowed to pack points together. It determines the minimum distance "
    "between points in the low-dimensional representation. A lower value "
    "allows UMAP to pack points more tightly, while a higher value spreads "
    "points out more evenly."
)

MIN_CLUSTER_SIZE_HELP = (
    "Minimum Cluster Size (min_cluster_size): This parameter sets the minimum "
    "size of clusters. It determines the smallest number of points that can "
    "form a cluster. A higher value will result in fewer clusters, while a "
    "lower value will result in more clusters."
)

MIN_SAMPLES_HELP = (
    "Minimum Samples (min_samples): This parameter sets the number of samples "
    "in a neighborhood for a point to be considered as a core point. It "
    "determines the minimum number of points required to form a cluster. A "
    "higher value will result in fewer points being considered as core "
    "points, while a lower value will result in more points being considered "
    "as core points."
)

UMAP_HELP = (
    "UMAP (Uniform Manifold Approximation and Projection) is a dimension "
    "reduction technique that can be used for visualisation similarly to "
    "t-SNE, but also for general non-linear dimension reduction. The "
    "algorithm is founded on three assumptions about the data. First, UMAP "
    "assumes that the data is uniformly distributed on Riemannian manifold. "
    "Second, UMAP assumes that the Riemannian metric is locally constant or "
    "can be approximated as such. Third, UMAP assumes that the manifold is "
    "locally connected. From these assumptions, UMAP then constructs a "
    "low-dimensional representation of the data that has these three "
    "properties. UMAP is a powerful tool for visualizing and understanding "
    "the structure of high-dimensional data."
)

HDBSCAN_HELP = (
    "HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications "
    "with Noise) is a density-based clustering algorithm. It is based on the "
    "idea that clusters are dense groups of points separated by regions of "
    "lower density. The algorithm finds clusters by looking for areas of the "
    "data that have a high density of points, separated by areas of low "
    "density. It is particularly useful for finding clusters of varying "
    "density in large spatial data. HDBSCAN is a powerful tool for clustering "
    "and visualizing high-dimensional data."
)


@st.cache_data
def load_data(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED):
    """Download MNIST and return a reproducible random subset.

    Args:
        frac: Fraction of the rows to keep.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        A ``(data_sample, labels_sample)`` pair: the sampled rows as a NumPy
        array and the matching labels as an integer ``pandas.Series`` whose
        index is reset to ``0..n-1`` so it lines up with the array rows.
    """
    # as_frame=True guarantees pandas objects, which the .sample/.loc calls
    # below depend on, instead of relying on the library default.
    mnist = fetch_openml('mnist_784', version=1, as_frame=True)
    sampled = mnist.data.sample(frac=frac, random_state=random_state)
    labels_sample = (
        mnist.target.loc[sampled.index].astype(int).reset_index(drop=True)
    )
    data_sample = sampled.to_numpy()
    return data_sample, labels_sample


@st.cache_data
def reduce_dimensions(data, n_components, n_neighbors, min_dist):
    """Fit UMAP on ``data`` and return the ``n_components``-dimensional embedding."""
    reducer = umap.UMAP(
        n_components=n_components,
        n_neighbors=n_neighbors,
        min_dist=min_dist,
    )
    return reducer.fit_transform(data)


@st.cache_data
def perform_clustering(data, min_samples, min_cluster_size):
    """Run HDBSCAN on ``data`` and return one cluster label per row."""
    clusterer = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
    )
    return clusterer.fit_predict(data)


def _scatter_by_group(embedding, groups, title, legend_title):
    """Build a 2D scatter plot of ``embedding`` coloured by categorical ``groups``.

    ``groups`` holds integer codes (digits or cluster ids). They are converted
    to strings so Plotly treats them as categories and assigns one discrete
    colour per group instead of interpolating along a continuous colour bar.
    """
    group_codes = np.asarray(groups)
    frame = pd.DataFrame(
        {
            "x": embedding[:, 0],
            "y": embedding[:, 1],
            legend_title: group_codes.astype(str),
        }
    )
    # np.unique sorts numerically, so the legend reads 0, 1, 2, ... rather
    # than in order of first appearance.
    legend_order = [str(code) for code in np.unique(group_codes)]
    return px.scatter(
        frame,
        x="x",
        y="y",
        color=legend_title,
        title=title,
        color_discrete_sequence=px.colors.qualitative.Set1,
        category_orders={legend_title: legend_order},
    )


# Streamlit app title
st.title('Interactive UMAP and HDBSCAN Visualization on MNIST Dataset')

# Display a brief description
st.markdown(
    f"""
This app demonstrates the use of UMAP for dimensionality reduction and HDBSCAN for clustering on the MNIST dataset.
Use the sliders to adjust UMAP parameters and observe how they influence the data representation and clustering.
NOTE: This is not the best way to predict the MNIST but a good representation of how to visualize and classify high dimensional clusters.
The data shown is a subset of the original MNIST set at {SAMPLE_FRACTION}
"""
)

# Load data
data, labels = load_data()

# Display original MNIST images
st.subheader('Sample Original MNIST Images')
fig, ax = plt.subplots(4, 4, figsize=(5, 5))
for i, axi in enumerate(ax.flat):
    axi.imshow(data[i].reshape(28, 28), cmap='gray')
    axi.set(xticks=[], yticks=[], xlabel=f"Digit: {labels[i]}")
plt.tight_layout()
st.pyplot(fig)
# Streamlit re-executes the script on every interaction; close the figure so
# pyplot's global registry does not accumulate one figure per rerun.
plt.close(fig)

with st.sidebar:
    with st.form("my_form", border=False):
        st.header('UMAP Parameters')
        n_neighbors = st.slider(
            'Number of neighbors',
            min_value=5,
            max_value=100,
            value=15,
            step=5,
            help=N_NEIGHBORS_HELP,
        )
        min_dist = st.slider(
            'Minimum distance',
            min_value=0.0,
            max_value=0.99,
            value=0.1,
            step=0.01,
            help=MIN_DIST_HELP,
        )

        # HDBSCAN parameters adjustable via sliders
        st.header('HDBSCAN Parameters')
        min_cluster_size = st.slider(
            'Minimum cluster size',
            min_value=5,
            max_value=200,
            value=30,
            step=5,
            help=MIN_CLUSTER_SIZE_HELP,
        )
        min_samples = st.slider(
            'Minimum samples',
            min_value=1,
            max_value=20,
            value=5,
            step=1,
            help=MIN_SAMPLES_HELP,
        )

        submitted_hdbscan = st.form_submit_button("Start HDBScan", type="primary")
        submitted_umap = st.form_submit_button("Start Umap", type="primary")

# Either button submits the whole form and runs the pipeline. Both stages are
# cached, so re-submitting with unchanged UMAP parameters reuses the stored
# embedding and only the clustering is recomputed.
if not (submitted_umap or submitted_hdbscan):
    st.info("Press 'Start Umap' or 'Start HDBScan' to start")
    st.stop()

# UMAP Reduction
st.subheader('UMAP Dimensionality Reduction', help=UMAP_HELP)
data_2d = reduce_dimensions(data, 2, n_neighbors, min_dist)
fig_2d = _scatter_by_group(data_2d, labels, "2D UMAP", "Digit")
st.plotly_chart(fig_2d)

# HDBSCAN Clustering
st.subheader('HDBSCAN Clustering', help=HDBSCAN_HELP)
clusters = perform_clustering(data_2d, min_samples, min_cluster_size)
fig_cluster = _scatter_by_group(data_2d, clusters, "HDBSCAN Clusters", "Cluster")
st.plotly_chart(fig_cluster)