Spaces:
Sleeping
Sleeping
Upload performance_evaluation.py
Browse files- performance_evaluation.py +101 -0
performance_evaluation.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import numpy as np
|
2 |
+
import matplotlib.pyplot as plt
|
3 |
+
from sklearn.cluster import AgglomerativeClustering
|
4 |
+
from sklearn.metrics import silhouette_score
|
5 |
+
from sklearn.preprocessing import normalize
|
6 |
+
import umap
|
7 |
+
|
8 |
+
# Load the synonym table. pandas is imported here because the import header of
# this script does not bring it in, so `pd` would otherwise raise NameError.
import pandas as pd

synonyms = pd.read_csv('Synonyms.csv')
# A bare `synonyms.head()` only renders in a notebook cell; print the preview
# so it is also visible when the file is executed as a script.
print(synonyms.head())
|
10 |
+
|
11 |
+
# Load precomputed embeddings
news_embeddings = np.load("news_embeddings.npy")

# Normalize embeddings (scikit-learn's `normalize` rescales each row; L2 norm by default)
normalized_embeddings = normalize(news_embeddings)

# Apply UMAP for dimensionality reduction.
# The seed is fixed so the silhouette sweep below is repeatable across runs.
reducer = umap.UMAP(n_components=10, random_state=42)
umap_embeddings = reducer.fit_transform(normalized_embeddings)
|
20 |
+
|
21 |
+
# List to hold silhouette scores, one entry per candidate cluster count
silhouette_scores = []
cluster_range = range(2, 20)  # You can adjust the range based on your needs (currently 2..19)

# Iterate through different numbers of clusters
for num_clusters in cluster_range:
    # Perform Agglomerative Clustering (Ward linkage) on the UMAP-reduced embeddings
    clustering_model = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
    cluster_labels = clustering_model.fit_predict(umap_embeddings)

    # Calculate silhouette score for this clustering and record it in sweep order,
    # so silhouette_scores[i] corresponds to cluster_range[i]
    silhouette_avg = silhouette_score(umap_embeddings, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print(f"Number of Clusters: {num_clusters}, Silhouette Score: {silhouette_avg:.4f}")
|
35 |
+
|
36 |
+
# Plot silhouette scores for each number of clusters
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.title('Silhouette Scores for Different Numbers of Clusters (With UMAP)')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.xticks(cluster_range)  # Show all tick marks
plt.grid()
plt.show()

# Determine the optimal number of clusters: the candidate with the highest
# silhouette score. `np.argmax` returns the first maximum, so ties resolve to
# the smaller cluster count; `int()` turns the NumPy integer into a plain index.
optimal_clusters = cluster_range[int(np.argmax(silhouette_scores))]
print(f"The optimal number of clusters is: {optimal_clusters}")
|
49 |
+
|
50 |
+
|
51 |
+
# Load precomputed embeddings
news_embeddings = np.load("news_embeddings.npy")

# Normalize embeddings
normalized_embeddings = normalize(news_embeddings)

# Define the number of clusters (fixed here; used for both the baseline and the UMAP run)
num_clusters = 7

# Perform Agglomerative Clustering without UMAP, directly on the normalized embeddings
clustering_model_no_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
cluster_labels_no_umap = clustering_model_no_umap.fit_predict(normalized_embeddings)
silhouette_no_umap = silhouette_score(normalized_embeddings, cluster_labels_no_umap)
print(f"Silhouette Score without UMAP: {silhouette_no_umap}")

# Visualization of clustering results (using first two dimensions for a simple scatter plot).
# Only raw embedding columns 0 and 1 are drawn, so this is a partial view of the clustering.
plt.figure(figsize=(10, 7))
plt.scatter(
    normalized_embeddings[:, 0],
    normalized_embeddings[:, 1],
    c=cluster_labels_no_umap,
    cmap='viridis',
    alpha=0.6,
)
plt.title(f'Agglomerative Clustering Visualization (Silhouette Score: {silhouette_no_umap:.4f})')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.colorbar(label='Cluster Label')
plt.show()
|
74 |
+
|
75 |
+
# Perform UMAP and visualize
import umap

# Reduce dimensionality to 10 with UMAP (this step is optional for this part).
# The seed matches the one used for the silhouette sweep earlier in this script,
# so the with/without-UMAP comparison does not change from run to run.
reducer = umap.UMAP(n_components=10, random_state=42)
reduced_embeddings = reducer.fit_transform(normalized_embeddings)

# Perform clustering on reduced embeddings
clustering_model_with_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
cluster_labels_with_umap = clustering_model_with_umap.fit_predict(reduced_embeddings)
silhouette_with_umap = silhouette_score(reduced_embeddings, cluster_labels_with_umap)
print(f"Silhouette Score after UMAP: {silhouette_with_umap}")

# Scatter plot of UMAP reduced embeddings (first two of the 10 components)
plt.figure(figsize=(10, 7))
plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=cluster_labels_with_umap,
    cmap='viridis',
    alpha=0.6,
)
plt.title(f'Agglomerative Clustering Visualization with UMAP (Silhouette Score: {silhouette_with_umap:.4f})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.colorbar(label='Cluster Label')
plt.show()
|
96 |
+
|
97 |
+
# Compare scores: report the silhouette difference between the UMAP-reduced
# clustering and the baseline clustering. Equal scores fall into the
# "did not improve" branch with a difference of 0.
if silhouette_with_umap > silhouette_no_umap:
    print(f"UMAP improved the silhouette score by {silhouette_with_umap - silhouette_no_umap}")
else:
    print(f"UMAP did not improve the silhouette score. Difference: {silhouette_no_umap - silhouette_with_umap}")
|