# app.py | Qarlsberg | commit 5c08d0a (verified): "Create app.py" | 5.84 kB
import streamlit as st
import umap
import hdbscan
import plotly.express as px
from sklearn.datasets import fetch_openml
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Function to load and cache the MNIST data
@st.cache_data
def load_data():
    """Fetch MNIST from OpenML and return a reproducible 20% random sample.

    Returns:
        tuple: ``(data_sample, labels_sample)`` where ``data_sample`` is a
        NumPy array holding the sampled feature rows and ``labels_sample``
        is a pandas Series of integer labels, re-indexed from 0 so that
        position ``i`` in both objects refers to the same sampled row.
    """
    # Request pandas objects explicitly: the row sampling and the label
    # lookup by index below need DataFrame/Series semantics, and the
    # library default for ``as_frame`` has differed between releases.
    mnist = fetch_openml('mnist_784', version=1, as_frame=True)

    # Fixed seed so every session sees the same subset.
    data_sample = mnist.data.sample(frac=0.2, random_state=42)

    # Pick the labels of the sampled rows by their original index, then
    # drop that index so the labels line up positionally with the array.
    labels_sample = (
        mnist.target.loc[data_sample.index].astype(int).reset_index(drop=True)
    )
    return data_sample.to_numpy(), labels_sample
# Function to reduce dimensions and cache the results
@st.cache_data
def reduce_dimensions(data, n_components, n_neighbors, min_dist):
    """Project ``data`` into ``n_components`` dimensions with UMAP.

    Args:
        data: Input samples handed to ``UMAP.fit_transform``.
        n_components: Dimensionality of the returned embedding.
        n_neighbors: UMAP ``n_neighbors`` setting.
        min_dist: UMAP ``min_dist`` setting.

    Returns:
        The embedding produced by ``fit_transform``.
    """
    umap_settings = {
        "n_components": n_components,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "random_state": 42,  # fixed seed, as in every call of this helper
    }
    embedding = umap.UMAP(**umap_settings).fit_transform(data)
    return embedding
# Function to perform and cache HDBSCAN clustering
@st.cache_data
def perform_clustering(data, min_samples, min_cluster_size):
    """Cluster ``data`` with HDBSCAN and return one label per sample.

    Args:
        data: Points to cluster, passed to ``HDBSCAN.fit_predict``.
        min_samples: HDBSCAN ``min_samples`` setting.
        min_cluster_size: HDBSCAN ``min_cluster_size`` setting.

    Returns:
        The cluster labels produced by ``fit_predict``.
    """
    model = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
    )
    cluster_labels = model.fit_predict(data)
    return cluster_labels
# Page heading and introductory text, kept as named constants so the
# rendering calls below stay short.
APP_TITLE = 'Interactive UMAP and HDBSCAN Visualization on MNIST Dataset'
APP_INTRO = """
This app demonstrates the use of UMAP for dimensionality reduction and HDBSCAN for clustering on the MNIST dataset.
Use the sliders to adjust UMAP parameters and observe how they influence the data representation and clustering.
"""

st.title(APP_TITLE)
st.markdown(APP_INTRO)

# Sampled MNIST features and labels (cached by load_data's decorator).
data, labels = load_data()
# Display original MNIST images: the first 16 sampled rows in a 4x4 grid,
# each row reshaped to a 28x28 image and captioned with its label.
st.subheader('Sample Original MNIST Images')
fig, ax = plt.subplots(4, 4, figsize=(5, 5))
for i, axi in enumerate(ax.flat):
    axi.imshow(data[i].reshape(28, 28), cmap='gray')
    axi.set(xticks=[], yticks=[], xlabel=f"Digit: {labels[i]}")
# Lay out this specific figure rather than whatever pyplot considers current.
fig.tight_layout()
st.pyplot(fig)
# The script is re-executed on every rerun; without closing, each run would
# leave another figure registered in pyplot and memory would keep growing.
plt.close(fig)
# Tooltip texts for the four sliders, hoisted out of the widget calls.
n_neighbors_help = "Number of Neighbors (n_neighbors): This parameter controls how UMAP balances local versus global structure in the data. It determines the number of neighboring points used in the local manifold approximations. A higher value considers more neighbors for a broader view of the data structure, while a lower value focuses on the local neighborhood, emphasizing finer details."
min_dist_help = "Minimum Distance (min_dist): This parameter controls how tightly UMAP is allowed to pack points together. It determines the minimum distance between points in the low-dimensional representation. A lower value allows UMAP to pack points more tightly, while a higher value spreads points out more evenly."
min_cluster_size_help = "Minimum Cluster Size (min_cluster_size): This parameter sets the minimum size of clusters. It determines the smallest number of points that can form a cluster. A higher value will result in fewer clusters, while a lower value will result in more clusters."
min_samples_help = "Minimum Samples (min_samples): This parameter sets the number of samples in a neighborhood for a point to be considered as a core point. It determines the minimum number of points required to form a cluster. A higher value will result in fewer points being considered as core points, while a lower value will result in more points being considered as core points."

# All parameter widgets live in one sidebar form; its submit button is what
# the gate below checks before any model is run.
with st.sidebar, st.form("my_form", border=False):
    st.header('UMAP Parameters')
    n_neighbors = st.slider(
        'Number of neighbors',
        min_value=5,
        max_value=100,
        value=15,
        step=5,
        help=n_neighbors_help,
    )
    min_dist = st.slider(
        'Minimum distance',
        min_value=0.0,
        max_value=0.99,
        value=0.1,
        step=0.01,
        help=min_dist_help,
    )

    # HDBSCAN parameters adjustable via sliders
    st.header('HDBSCAN Parameters')
    min_cluster_size = st.slider(
        'Minimum cluster size',
        min_value=5,
        max_value=200,
        value=30,
        step=5,
        help=min_cluster_size_help,
    )
    min_samples = st.slider(
        'Minimum samples',
        min_value=1,
        max_value=20,
        value=5,
        step=1,
        help=min_samples_help,
    )
    submitted = st.form_submit_button("Start", type="primary")

# Halt the script here until the form has been submitted, so the UMAP and
# HDBSCAN steps further down only run on request.
if not submitted:
    st.error("Press start to run the models")
    st.stop()
# UMAP Reduction
st.subheader('UMAP Dimensionality Reduction', help="UMAP (Uniform Manifold Approximation and Projection) is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction. The algorithm is founded on three assumptions about the data. First, UMAP assumes that the data is uniformly distributed on Riemannian manifold. Second, UMAP assumes that the Riemannian metric is locally constant or can be approximated as such. Third, UMAP assumes that the manifold is locally connected. From these assumptions, UMAP then constructs a low-dimensional representation of the data that has these three properties. UMAP is a powerful tool for visualizing and understanding the structure of high-dimensional data.")
data_2d = reduce_dimensions(data, 2, n_neighbors, min_dist)

# Digit labels are categories, not magnitudes. Passing them as integers makes
# Plotly draw a continuous gradient with a colorbar; converting them to
# strings yields one discrete colour and one legend entry per digit.
umap_df = pd.DataFrame({
    "x": data_2d[:, 0],
    "y": data_2d[:, 1],
    "digit": np.asarray(labels).astype(str),
})
fig_2d = px.scatter(
    umap_df,
    x="x",
    y="y",
    color="digit",
    title="2D UMAP",
    # A 10-colour qualitative palette so no two digits share a colour.
    color_discrete_sequence=px.colors.qualitative.D3,
    # Keep the legend in numeric order rather than order of first appearance.
    category_orders={"digit": [str(d) for d in np.unique(labels)]},
)
st.plotly_chart(fig_2d)
# HDBSCAN Clustering
st.subheader('HDBSCAN Clustering', help="HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm. It is based on the idea that clusters are dense groups of points separated by regions of lower density. The algorithm finds clusters by looking for areas of the data that have a high density of points, separated by areas of low density. It is particularly useful for finding clusters of varying density in large spatial data. HDBSCAN is a powerful tool for clustering and visualizing high-dimensional data.")
clusters = perform_clustering(data_2d, min_samples, min_cluster_size)

# Cluster ids are categories. As integers they would be drawn on a continuous
# gradient, making neighbouring ids nearly indistinguishable; as strings each
# cluster gets its own discrete colour and legend entry.
cluster_df = pd.DataFrame({
    "x": data_2d[:, 0],
    "y": data_2d[:, 1],
    "cluster": np.asarray(clusters).astype(str),
})
fig_cluster = px.scatter(
    cluster_df,
    x="x",
    y="y",
    color="cluster",
    title="HDBSCAN Clusters",
    # Large qualitative palette, since the number of clusters is not known
    # in advance.
    color_discrete_sequence=px.colors.qualitative.Alphabet,
    # HDBSCAN reports unclustered (noise) points with the label -1; show
    # them in a neutral grey so they do not read as a cluster of their own.
    color_discrete_map={"-1": "lightgrey"},
    # Keep the legend in numeric order (noise first, then clusters 0, 1, ...).
    category_orders={"cluster": [str(c) for c in np.unique(clusters)]},
)
st.plotly_chart(fig_cluster)