# Scraped diff-viewer header (not part of the script): Browse files- +101 -0
# @@ -0,0 +1,101 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
# --- Auxiliary data ----------------------------------------------------------
# Load the synonyms CSV into a DataFrame (requires `import pandas as pd`).
# NOTE(review): `synonyms` is not referenced again in the visible script --
# confirm whether this load is still needed.
synonyms = pd.read_csv('Synonyms.csv')
# --- Load, normalize and reduce the embeddings -------------------------------
# Load precomputed embeddings
# (assumes a 2-D array of shape (n_samples, n_features) -- TODO confirm)
news_embeddings = np.load("news_embeddings.npy")

# Normalize embeddings (sklearn's `normalize` defaults to a row-wise L2 norm)
normalized_embeddings = normalize(news_embeddings)

# Apply UMAP for dimensionality reduction: project down to 10 components.
# random_state is pinned so repeated runs produce the same projection.
reducer = umap.UMAP(n_components=10, random_state=42)
umap_embeddings = reducer.fit_transform(normalized_embeddings)
# --- Sweep candidate cluster counts ------------------------------------------
# List to hold silhouette scores, one entry per value in `cluster_range`
silhouette_scores = []
cluster_range = range(2, 20)  # You can adjust the range based on your needs

# Iterate through different numbers of clusters
for num_clusters in cluster_range:
    # Perform Agglomerative Clustering (Ward linkage) on the UMAP projection
    clustering_model = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    cluster_labels = clustering_model.fit_predict(umap_embeddings)

    # Calculate silhouette score
    silhouette_avg = silhouette_score(umap_embeddings, cluster_labels)

    # Record the score: the plot and the arg-max that follow this loop read
    # `silhouette_scores` and fail on an empty list.
    silhouette_scores.append(silhouette_avg)

    print(f"Number of Clusters: {num_clusters}, Silhouette Score: {silhouette_avg:.4f}")
# --- Plot the sweep and pick the best cluster count --------------------------
# Plot silhouette scores for each number of clusters
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.title('Silhouette Scores for Different Numbers of Clusters (With UMAP)')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(cluster_range)  # Show all tick marks
# Render the figure; in a plain script run nothing is displayed without this.
plt.show()

# Determine the optimal number of clusters: the cluster count whose silhouette
# score is highest (np.argmax returns the first maximum, so ties resolve to
# the smaller cluster count).
optimal_clusters = cluster_range[np.argmax(silhouette_scores)]
print(f"The optimal number of clusters is: {optimal_clusters}")
# --- Baseline: cluster the normalized embeddings without UMAP ----------------
# Load precomputed embeddings
# NOTE(review): this repeats the load/normalize done near the top of the
# script; presumably kept so this section can be run on its own -- confirm.
news_embeddings = np.load("news_embeddings.npy")

# Normalize embeddings
normalized_embeddings = normalize(news_embeddings)

# Define the number of clusters used by both the no-UMAP and the UMAP run.
# NOTE(review): hard-coded rather than taken from `optimal_clusters` --
# confirm this is intended.
num_clusters = 7

# Perform Agglomerative Clustering without UMAP
clustering_model_no_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
cluster_labels_no_umap = clustering_model_no_umap.fit_predict(normalized_embeddings)
silhouette_no_umap = silhouette_score(normalized_embeddings, cluster_labels_no_umap)
print(f"Silhouette Score without UMAP: {silhouette_no_umap}")

# Visualization of clustering results (using first two dimensions for a simple
# scatter plot). Only columns 0 and 1 of the embeddings are drawn; the
# clustering itself used every dimension.
plt.figure(figsize=(10, 7))
plt.scatter(normalized_embeddings[:, 0], normalized_embeddings[:, 1], c=cluster_labels_no_umap, cmap='viridis', alpha=0.6)
plt.title(f'Agglomerative Clustering Visualization (Silhouette Score: {silhouette_no_umap:.4f})')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.colorbar(label='Cluster Label')
# Render the figure; in a plain script run nothing is displayed without this.
plt.show()
# --- Same clustering on UMAP-reduced embeddings ------------------------------
# Perform UMAP and visualize.
# Reduce dimensionality to 10 with UMAP (this step is optional for this part).
# random_state is pinned, matching the UMAP call earlier in the script, so the
# silhouette comparison is reproducible from run to run.
reducer = umap.UMAP(n_components=10, random_state=42)
reduced_embeddings = reducer.fit_transform(normalized_embeddings)

# Perform clustering on reduced embeddings
clustering_model_with_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
cluster_labels_with_umap = clustering_model_with_umap.fit_predict(reduced_embeddings)
silhouette_with_umap = silhouette_score(reduced_embeddings, cluster_labels_with_umap)
print(f"Silhouette Score after UMAP: {silhouette_with_umap}")

# Scatter plot of UMAP reduced embeddings (first two of the ten components)
plt.figure(figsize=(10, 7))
plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=cluster_labels_with_umap, cmap='viridis', alpha=0.6)
plt.title(f'Agglomerative Clustering Visualization with UMAP (Silhouette Score: {silhouette_with_umap:.4f})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.colorbar(label='Cluster Label')
# Render the figure; in a plain script run nothing is displayed without this.
plt.show()
# --- Compare scores ----------------------------------------------------------
# Report exactly one outcome. A tie falls into the "did not improve" branch
# with a difference of 0.
if silhouette_with_umap > silhouette_no_umap:
    print(f"UMAP improved the silhouette score by {silhouette_with_umap - silhouette_no_umap}")
else:
    print(f"UMAP did not improve the silhouette score. Difference: {silhouette_no_umap - silhouette_with_umap}")