# Spaces: Sleeping
from pathlib import Path

import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
def load_data(path=None):
    """Load the embedding matrix from a NumPy ``.npy`` file.

    Args:
        path: Location of the ``.npy`` file (string or path-like). When
            omitted, ``data/top_cluster_embeddings.npy`` relative to the
            current working directory is used.

    Returns:
        The array stored in the file, as returned by ``numpy.load``.
    """
    if path is None:
        # Build the default from components: a literal backslash separator
        # only resolves to the intended file on Windows.
        path = Path('data') / 'top_cluster_embeddings.npy'
    return np.load(path)
def get_clusters(embeddings, *, n_neighbors=15, n_components=15,
                 min_cluster_size=30, random_state=None):
    """Cluster embeddings with UMAP reduction followed by HDBSCAN.

    The embeddings are first reduced with UMAP (cosine metric), then the
    reduced points are clustered with HDBSCAN (euclidean metric, ``'eom'``
    cluster selection).

    Args:
        embeddings: Input data handed to ``umap.UMAP(...).fit_transform``.
        n_neighbors: UMAP ``n_neighbors`` setting.
        n_components: Number of UMAP output dimensions used for clustering.
        min_cluster_size: HDBSCAN ``min_cluster_size`` setting.
        random_state: Optional seed forwarded to UMAP so repeated runs can
            produce the same reduction. ``None`` keeps UMAP's default
            (unseeded) behaviour.

    Returns:
        The ``labels_`` attribute of the fitted HDBSCAN model.
    """
    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        n_components=n_components,
        metric='cosine',
        random_state=random_state,
    )
    reduced = reducer.fit_transform(embeddings)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        metric='euclidean',
        cluster_selection_method='eom',
    )
    clusterer.fit(reduced)
    return clusterer.labels_
def get_2d_data_for_plotting(embeddings):
    """Reduce the embeddings to two UMAP components for scatter plotting.

    Uses UMAP with ``n_neighbors=15`` and the cosine metric, and returns the
    result of ``fit_transform`` on ``embeddings``.
    """
    projector = umap.UMAP(n_neighbors=15, n_components=2, metric='cosine')
    return projector.fit_transform(embeddings)
def plot_clusters(embeddings, cluster_labels, output_path=None):
    """Project embeddings to 2D and save a scatter plot coloured by cluster.

    Args:
        embeddings: Data passed to ``get_2d_data_for_plotting`` to obtain the
            2D coordinates.
        cluster_labels: One label per embedding row. Rows labelled ``-1`` are
            drawn in grey as outliers; all other rows are coloured by label.
        output_path: Destination image file (string or path-like). Defaults
            to ``plots/clusters.png`` relative to the working directory.
    """
    if output_path is None:
        # Build the default from components: a literal backslash separator
        # only resolves to the intended location on Windows.
        output_path = Path('plots') / 'clusters.png'
    output_path = Path(output_path)

    umap_data = get_2d_data_for_plotting(embeddings)
    result = pd.DataFrame(umap_data, columns=['x', 'y'])
    result['labels'] = cluster_labels

    outliers = result.loc[result.labels == -1, :]
    clustered = result.loc[result.labels != -1, :]

    # Draw on the explicit figure/axes rather than pyplot's implicit current
    # figure, and always close the figure so repeated calls do not accumulate
    # open figures.
    fig, ax = plt.subplots(figsize=(20, 10))
    try:
        ax.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
        points = ax.scatter(
            clustered.x,
            clustered.y,
            c=clustered.labels,
            s=0.05,
            cmap='hsv_r',
        )
        fig.colorbar(points, ax=ax)
        # Ensure the target directory exists before writing the image.
        output_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(output_path, dpi=300)
    finally:
        plt.close(fig)
def main():
    """Run the pipeline: load embeddings, cluster them, and save the plot."""
    vectors = load_data()
    plot_clusters(vectors, get_clusters(vectors))


# Only run the pipeline when executed as a script, not when imported.
if __name__ == '__main__':
    main()