Spaces:

Qarlsberg
/

UmapHDBscanTester

Sleeping

App Files Files Community

Qarlsberg committed on Feb 27

Commit

5c08d0a

•

1 Parent(s): 7170205

Create app.py

Browse files

Files changed (1) hide show

app.py +84 -0

app.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import streamlit as st
+import umap
+import hdbscan
+import plotly.express as px
+from sklearn.datasets import fetch_openml
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+# Function to load and cache the MNIST data
+@st.cache_data
+def load_data():
+    mnist = fetch_openml('mnist_784', version=1)
+    data_df = pd.DataFrame(mnist.data)
+    data_sample = data_df.sample(frac=0.2, random_state=42)
+    # Drop data_df from memory
+    del data_df
+    labels_sample = mnist.target.loc[data_sample.index].astype(int).reset_index(drop=True)
+    data_sample = data_sample.to_numpy()
+    return data_sample, labels_sample
+# Function to reduce dimensions and cache the results
+@st.cache_data
+def reduce_dimensions(data, n_components, n_neighbors, min_dist):
+    reducer = umap.UMAP(n_components=n_components, n_neighbors=n_neighbors, min_dist=min_dist, random_state=42)
+    return reducer.fit_transform(data)
+# Function to perform and cache HDBSCAN clustering
+@st.cache_data
+def perform_clustering(data, min_samples, min_cluster_size):
+    clusterer = hdbscan.HDBSCAN(min_samples=min_samples, min_cluster_size=min_cluster_size)
+    return clusterer.fit_predict(data)
+# Streamlit app title
+st.title('Interactive UMAP and HDBSCAN Visualization on MNIST Dataset')
+# Display a brief description
+st.markdown("""
+This app demonstrates the use of UMAP for dimensionality reduction and HDBSCAN for clustering on the MNIST dataset.
+Use the sliders to adjust UMAP parameters and observe how they influence the data representation and clustering.
+""")
+# Load data
+data, labels = load_data()
+# Display original MNIST images
+st.subheader('Sample Original MNIST Images')
+fig, ax = plt.subplots(4, 4, figsize=(5, 5))
+for i, axi in enumerate(ax.flat):
+    axi.imshow(data[i].reshape(28, 28), cmap='gray')
+    axi.set(xticks=[], yticks=[], xlabel=f"Digit: {labels[i]}")
+plt.tight_layout()
+st.pyplot(fig)
+with st.sidebar:
+    with st.form("my_form", border=False):
+        st.header('UMAP Parameters')
+        n_neighbors = st.slider('Number of neighbors', min_value=5, max_value=100, value=15, step=5,help="Number of Neighbors (n_neighbors): This parameter controls how UMAP balances local versus global structure in the data. It determines the number of neighboring points used in the local manifold approximations. A higher value considers more neighbors for a broader view of the data structure, while a lower value focuses on the local neighborhood, emphasizing finer details.")
+        min_dist = st.slider('Minimum distance', min_value=0.0, max_value=0.99, value=0.1, step=0.01,help="Minimum Distance (min_dist): This parameter controls how tightly UMAP is allowed to pack points together. It determines the minimum distance between points in the low-dimensional representation. A lower value allows UMAP to pack points more tightly, while a higher value spreads points out more evenly.")
+        # HDBSCAN parameters adjustable via sliders
+        st.header('HDBSCAN Parameters')
+        min_cluster_size = st.slider('Minimum cluster size', min_value=5, max_value=200, value=30, step=5,help="Minimum Cluster Size (min_cluster_size): This parameter sets the minimum size of clusters. It determines the smallest number of points that can form a cluster. A higher value will result in fewer clusters, while a lower value will result in more clusters.")
+        min_samples = st.slider('Minimum samples', min_value=1, max_value=20, value=5, step=1,help="Minimum Samples (min_samples): This parameter sets the number of samples in a neighborhood for a point to be considered as a core point. It determines the minimum number of points required to form a cluster. A higher value will result in fewer points being considered as core points, while a lower value will result in more points being considered as core points.")
+        submitted = st.form_submit_button("Start", type="primary")
+if not submitted:
+    st.error("Press start to run the models")
+    st.stop()
+# UMAP Reduction
+st.subheader('UMAP Dimensionality Reduction',help="UMAP (Uniform Manifold Approximation and Projection) is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction. The algorithm is founded on three assumptions about the data. First, UMAP assumes that the data is uniformly distributed on Riemannian manifold. Second, UMAP assumes that the Riemannian metric is locally constant or can be approximated as such. Third, UMAP assumes that the manifold is locally connected. From these assumptions, UMAP then constructs a low-dimensional representation of the data that has these three properties. UMAP is a powerful tool for visualizing and understanding the structure of high-dimensional data.")
+data_2d = reduce_dimensions(data, 2, n_neighbors, min_dist)
+fig_2d = px.scatter(x=data_2d[:, 0], y=data_2d[:, 1], color=labels, title="2D UMAP", color_continuous_scale=px.colors.qualitative.Set1)
+st.plotly_chart(fig_2d)
+# HDBSCAN Clustering
+st.subheader('HDBSCAN Clustering', help="HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm. It is based on the idea that clusters are dense groups of points separated by regions of lower density. The algorithm finds clusters by looking for areas of the data that have a high density of points, separated by areas of low density. It is particularly useful for finding clusters of varying density in large spatial data. HDBSCAN is a powerful tool for clustering and visualizing high-dimensional data.")
+clusters = perform_clustering(data_2d, min_samples, min_cluster_size)
+fig_cluster = px.scatter(x=data_2d[:, 0], y=data_2d[:, 1], color=clusters, title="HDBSCAN Clusters", color_continuous_scale=px.colors.qualitative.Set1)
+st.plotly_chart(fig_cluster)