Sru15 committed on
Commit
ab43a3b
1 Parent(s): 11b5a6f

Upload performance_evaluation.py

Browse files
Files changed (1) hide show
  1. performance_evaluation.py +101 -0
performance_evaluation.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
7
+
8
# Load Synonyms.csv into a DataFrame and show its first rows.
# A bare `synonyms.head()` expression only displays output in a notebook; in a
# script its result is silently discarded, so the preview is printed instead.
synonyms = pd.read_csv('Synonyms.csv')
print(synonyms.head())
10
+
11
# Read the precomputed news embeddings from disk.
news_embeddings = np.load("news_embeddings.npy")

# Rescale the embeddings with scikit-learn's `normalize` (default settings).
normalized_embeddings = normalize(news_embeddings)

# Project the normalized embeddings onto 10 UMAP components; the seed is fixed
# so the projection can be reproduced between runs.
reducer = umap.UMAP(random_state=42, n_components=10)
umap_embeddings = reducer.fit_transform(normalized_embeddings)
20
+
21
# Candidate cluster counts (2 through 19) and the silhouette score measured
# for each of them, stored in the same order.
cluster_range = range(2, 20)  # widen or narrow this range as needed
silhouette_scores = []

for num_clusters in cluster_range:
    # Ward-linkage agglomerative clustering on the UMAP-reduced embeddings.
    clustering_model = AgglomerativeClustering(
        linkage='ward',
        n_clusters=num_clusters,
    )
    cluster_labels = clustering_model.fit_predict(umap_embeddings)

    # Score this partition and record the result.
    silhouette_avg = silhouette_score(umap_embeddings, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print(f"Number of Clusters: {num_clusters}, Silhouette Score: {silhouette_avg:.4f}")
35
+
36
# Line chart of silhouette score against the number of clusters tried.
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o')
plt.xticks(cluster_range)  # one tick per evaluated cluster count
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Different Numbers of Clusters (With UMAP)')
plt.grid()
plt.show()
45
+
46
# The best cluster count is the one with the highest silhouette score;
# np.argmax returns the first position when several scores tie.
best_index = np.argmax(silhouette_scores)
optimal_clusters = cluster_range[best_index]
print(f"The optimal number of clusters is: {optimal_clusters}")
49
+
50
+
51
# Read the precomputed embeddings again and normalize them for the baseline run.
news_embeddings = np.load("news_embeddings.npy")
normalized_embeddings = normalize(news_embeddings)

# Fixed cluster count used by the clustering runs that follow.
num_clusters = 7

# Baseline: Ward-linkage agglomerative clustering applied directly to the
# normalized embeddings (no UMAP), scored with the silhouette coefficient.
clustering_model_no_umap = AgglomerativeClustering(
    linkage='ward',
    n_clusters=num_clusters,
)
cluster_labels_no_umap = clustering_model_no_umap.fit_predict(normalized_embeddings)
silhouette_no_umap = silhouette_score(normalized_embeddings, cluster_labels_no_umap)
print(f"Silhouette Score without UMAP: {silhouette_no_umap}")
65
+
66
# Scatter plot of the baseline clustering, coloured by cluster label.
# Only the first two embedding dimensions are drawn, so this is a rough view
# of the data rather than a 2-D projection of it.
plt.figure(figsize=(10, 7))
plt.scatter(
    normalized_embeddings[:, 0],
    normalized_embeddings[:, 1],
    c=cluster_labels_no_umap,
    cmap='viridis',
    alpha=0.6,
)
plt.colorbar(label='Cluster Label')
plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title(f'Agglomerative Clustering Visualization (Silhouette Score: {silhouette_no_umap:.4f})')
plt.show()
74
+
75
# Reduce the normalized embeddings to 10 UMAP components before clustering.
# random_state is pinned to 42, the same seed as the reducer used for the
# silhouette sweep earlier in this script; without it UMAP is stochastic and
# the score reported below (and the final comparison) changes from run to run.
# `umap` is already imported at the top of the file, so it is not re-imported.
reducer = umap.UMAP(n_components=10, random_state=42)
reduced_embeddings = reducer.fit_transform(normalized_embeddings)

# Ward-linkage agglomerative clustering on the reduced embeddings, using the
# same cluster count as the run without UMAP so the two scores are comparable.
clustering_model_with_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
cluster_labels_with_umap = clustering_model_with_umap.fit_predict(reduced_embeddings)

# Silhouette score measured in the reduced (UMAP) space.
silhouette_with_umap = silhouette_score(reduced_embeddings, cluster_labels_with_umap)
print(f"Silhouette Score after UMAP: {silhouette_with_umap}")
87
+
88
# Scatter plot of the first two UMAP components, coloured by cluster label.
plt.figure(figsize=(10, 7))
plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=cluster_labels_with_umap,
    cmap='viridis',
    alpha=0.6,
)
plt.colorbar(label='Cluster Label')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.title(f'Agglomerative Clustering Visualization with UMAP (Silhouette Score: {silhouette_with_umap:.4f})')
plt.show()
96
+
97
# Report whether clustering on the UMAP-reduced embeddings scored strictly
# higher than clustering on the normalized embeddings, and by how much.
umap_helped = silhouette_with_umap > silhouette_no_umap
if umap_helped:
    print(f"UMAP improved the silhouette score by {silhouette_with_umap - silhouette_no_umap}")
else:
    print(f"UMAP did not improve the silhouette score. Difference: {silhouette_no_umap - silhouette_with_umap}")