Qarlsberg committed on
Commit
5c08d0a
1 Parent(s): 7170205

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import umap
3
+ import hdbscan
4
+ import plotly.express as px
5
+ from sklearn.datasets import fetch_openml
6
+ import numpy as np
7
+ import matplotlib.pyplot as plt
8
+ import pandas as pd
9
+
10
+
11
+
12
# Function to load and cache the MNIST data
@st.cache_data
def load_data():
    """Download MNIST from OpenML and return a fixed 20% random sample.

    Returns:
        A ``(data_sample, labels_sample)`` tuple. ``data_sample`` is a NumPy
        array holding the sampled rows; ``labels_sample`` is an integer pandas
        Series re-indexed from 0 so that ``labels_sample[i]`` is the label of
        ``data_sample[i]``.
    """
    # Request a DataFrame/Series explicitly: the ``.loc`` label lookup below
    # only works when ``mnist.target`` is a pandas Series sharing the index of
    # ``mnist.data``.
    mnist = fetch_openml('mnist_784', version=1, as_frame=True)

    # Sample straight from the fetched frame. Wrapping it in another DataFrame
    # and ``del``-ing the wrapper released no memory, because ``mnist`` keeps
    # a reference to the full data until this function returns.
    data_sample = mnist.data.sample(frac=0.2, random_state=42)

    # Pick the labels of the sampled rows by index, then drop the original
    # index so positions line up with the rows of the returned array.
    labels_sample = (
        mnist.target.loc[data_sample.index].astype(int).reset_index(drop=True)
    )
    return data_sample.to_numpy(), labels_sample
23
# Cached UMAP projection of the input data
@st.cache_data
def reduce_dimensions(data, n_components, n_neighbors, min_dist):
    """Fit UMAP on ``data`` and return the resulting embedding.

    Args:
        data: Samples to embed; handed unchanged to ``UMAP.fit_transform``.
        n_components: Number of dimensions of the embedding.
        n_neighbors: Value forwarded as UMAP's ``n_neighbors``.
        min_dist: Value forwarded as UMAP's ``min_dist``.

    Returns:
        The output of ``UMAP.fit_transform(data)``.
    """
    umap_settings = {
        "n_components": n_components,
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "random_state": 42,  # fixed seed, same for every call
    }
    embedder = umap.UMAP(**umap_settings)
    embedding = embedder.fit_transform(data)
    return embedding
28
+
29
# Cached HDBSCAN clustering of the supplied points
@st.cache_data
def perform_clustering(data, min_samples, min_cluster_size):
    """Cluster ``data`` with HDBSCAN and return one label per sample.

    Args:
        data: Points to cluster; handed unchanged to ``HDBSCAN.fit_predict``.
        min_samples: Value forwarded as HDBSCAN's ``min_samples``.
        min_cluster_size: Value forwarded as HDBSCAN's ``min_cluster_size``.

    Returns:
        The output of ``HDBSCAN.fit_predict(data)``.
    """
    model = hdbscan.HDBSCAN(
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
    )
    cluster_labels = model.fit_predict(data)
    return cluster_labels
34
+
35
# Page header: the title, then a short explanation of what the demo shows
st.title('Interactive UMAP and HDBSCAN Visualization on MNIST Dataset')

intro_text = """
This app demonstrates the use of UMAP for dimensionality reduction and HDBSCAN for clustering on the MNIST dataset.
Use the sliders to adjust UMAP parameters and observe how they influence the data representation and clustering.
"""
st.markdown(intro_text)

# Fetch the sampled images and their labels (load_data is cached)
data, labels = load_data()
46
+
47
# Display original MNIST images
st.subheader('Sample Original MNIST Images')
fig, ax = plt.subplots(4, 4, figsize=(5, 5))
for i, axi in enumerate(ax.flat):
    # Each row of `data` is reshaped to a 28x28 image; the matching label is
    # shown underneath in place of axis ticks.
    axi.imshow(data[i].reshape(28, 28), cmap='gray')
    axi.set(xticks=[], yticks=[], xlabel=f"Digit: {labels[i]}")
# Lay out this specific figure rather than pyplot's implicit "current" one.
fig.tight_layout()
st.pyplot(fig)
# Streamlit re-executes the script on every interaction and pyplot keeps each
# figure it created alive until it is closed, so release it once rendered.
plt.close(fig)
55
+
56
# Tooltip texts for the sidebar sliders, kept apart from the layout code.
n_neighbors_help = (
    "Number of Neighbors (n_neighbors): This parameter controls how UMAP balances "
    "local versus global structure in the data. It determines the number of "
    "neighboring points used in the local manifold approximations. A higher value "
    "considers more neighbors for a broader view of the data structure, while a "
    "lower value focuses on the local neighborhood, emphasizing finer details."
)
min_dist_help = (
    "Minimum Distance (min_dist): This parameter controls how tightly UMAP is "
    "allowed to pack points together. It determines the minimum distance between "
    "points in the low-dimensional representation. A lower value allows UMAP to "
    "pack points more tightly, while a higher value spreads points out more evenly."
)
min_cluster_size_help = (
    "Minimum Cluster Size (min_cluster_size): This parameter sets the minimum size "
    "of clusters. It determines the smallest number of points that can form a "
    "cluster. A higher value will result in fewer clusters, while a lower value "
    "will result in more clusters."
)
min_samples_help = (
    "Minimum Samples (min_samples): This parameter sets the number of samples in a "
    "neighborhood for a point to be considered as a core point. It determines the "
    "minimum number of points required to form a cluster. A higher value will "
    "result in fewer points being considered as core points, while a lower value "
    "will result in more points being considered as core points."
)

# All parameter widgets live in one sidebar form with a single "Start" button.
with st.sidebar, st.form("my_form", border=False):
    st.header('UMAP Parameters')
    n_neighbors = st.slider(
        'Number of neighbors',
        min_value=5,
        max_value=100,
        value=15,
        step=5,
        help=n_neighbors_help,
    )
    min_dist = st.slider(
        'Minimum distance',
        min_value=0.0,
        max_value=0.99,
        value=0.1,
        step=0.01,
        help=min_dist_help,
    )

    # HDBSCAN parameters adjustable via sliders
    st.header('HDBSCAN Parameters')
    min_cluster_size = st.slider(
        'Minimum cluster size',
        min_value=5,
        max_value=200,
        value=30,
        step=5,
        help=min_cluster_size_help,
    )
    min_samples = st.slider(
        'Minimum samples',
        min_value=1,
        max_value=20,
        value=5,
        step=1,
        help=min_samples_help,
    )

    submitted = st.form_submit_button("Start", type="primary")

# Nothing below runs until the form has been submitted.
# NOTE(review): the diff view lost the original indentation; this guard is
# assumed to sit at top level (message in the main area) — confirm.
if not submitted:
    st.error("Press start to run the models")
    st.stop()
73
+
74
# UMAP Reduction
st.subheader('UMAP Dimensionality Reduction',help="UMAP (Uniform Manifold Approximation and Projection) is a dimension reduction technique that can be used for visualisation similarly to t-SNE, but also for general non-linear dimension reduction. The algorithm is founded on three assumptions about the data. First, UMAP assumes that the data is uniformly distributed on Riemannian manifold. Second, UMAP assumes that the Riemannian metric is locally constant or can be approximated as such. Third, UMAP assumes that the manifold is locally connected. From these assumptions, UMAP then constructs a low-dimensional representation of the data that has these three properties. UMAP is a powerful tool for visualizing and understanding the structure of high-dimensional data.")
data_2d = reduce_dimensions(data, 2, n_neighbors, min_dist)

# Digits are class labels, not magnitudes. Plotting the integers directly made
# Plotly treat them as a continuous variable (colour bar, interpolated colours).
# Casting to strings switches it to discrete colours with one legend entry per
# digit. The columns are named explicitly so `category_orders` can refer to
# them; "x"/"y" keep the axis titles unchanged.
umap_df = pd.DataFrame({
    "x": data_2d[:, 0],
    "y": data_2d[:, 1],
    "Digit": np.asarray(labels).astype(str),
})
fig_2d = px.scatter(
    umap_df,
    x="x",
    y="y",
    color="Digit",
    title="2D UMAP",
    # Legend in numeric order instead of order of first appearance.
    category_orders={"Digit": [str(digit) for digit in range(10)]},
    # A 10-colour palette, so every digit gets its own colour.
    color_discrete_sequence=px.colors.qualitative.D3,
)
st.plotly_chart(fig_2d)
79
+
80
# HDBSCAN Clustering
st.subheader('HDBSCAN Clustering', help="HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm. It is based on the idea that clusters are dense groups of points separated by regions of lower density. The algorithm finds clusters by looking for areas of the data that have a high density of points, separated by areas of low density. It is particularly useful for finding clusters of varying density in large spatial data. HDBSCAN is a powerful tool for clustering and visualizing high-dimensional data.")
clusters = perform_clustering(data_2d, min_samples, min_cluster_size)

# Cluster ids are categories, not quantities. Plotting the raw integers made
# Plotly draw them on a continuous colour scale. Casting to strings gives each
# cluster a discrete colour and its own legend entry.
cluster_ids = np.asarray(clusters)
cluster_df = pd.DataFrame({
    "x": data_2d[:, 0],
    "y": data_2d[:, 1],
    "Cluster": cluster_ids.astype(str),
})
fig_cluster = px.scatter(
    cluster_df,
    x="x",
    y="y",
    color="Cluster",
    title="HDBSCAN Clusters",
    # np.unique returns the ids sorted numerically; use that as legend order.
    category_orders={"Cluster": [str(cid) for cid in np.unique(cluster_ids)]},
    # The palette is cycled by Plotly when there are more clusters than colours.
    color_discrete_sequence=px.colors.qualitative.Set1,
)
st.plotly_chart(fig_cluster)