Spaces:

Sru15
/

Marathi-Semantic-Search

Sleeping

App Files Files Community

Sru15 committed on Oct 26

Commit

ab43a3b

•

1 Parent(s): 11b5a6f

Upload performance_evaluation.py

Browse files

Files changed (1) hide show

performance_evaluation.py +101 -0

performance_evaluation.py ADDED Viewed

	@@ -0,0 +1,101 @@

+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import umap
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.metrics import silhouette_score
+from sklearn.preprocessing import normalize
+synonyms = pd.read_csv('Synonyms.csv')
+synonyms.head()
+# Load precomputed embeddings
+news_embeddings = np.load("news_embeddings.npy")
+# Normalize embeddings
+normalized_embeddings = normalize(news_embeddings)
+# Apply UMAP for dimensionality reduction
+reducer = umap.UMAP(n_components=10, random_state=42)
+umap_embeddings = reducer.fit_transform(normalized_embeddings)
+# List to hold silhouette scores
+silhouette_scores = []
+cluster_range = range(2, 20)  # You can adjust the range based on your needs
+# Iterate through different numbers of clusters
+for num_clusters in cluster_range:
+    # Perform Agglomerative Clustering
+    clustering_model = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
+    cluster_labels = clustering_model.fit_predict(umap_embeddings)
+    # Calculate silhouette score
+    silhouette_avg = silhouette_score(umap_embeddings, cluster_labels)
+    silhouette_scores.append(silhouette_avg)
+    print(f"Number of Clusters: {num_clusters}, Silhouette Score: {silhouette_avg:.4f}")
+# Plot silhouette scores for each number of clusters
+plt.figure(figsize=(10, 6))
+plt.plot(cluster_range, silhouette_scores, marker='o')
+plt.title('Silhouette Scores for Different Numbers of Clusters (With UMAP)')
+plt.xlabel('Number of Clusters')
+plt.ylabel('Silhouette Score')
+plt.xticks(cluster_range)  # Show all tick marks
+plt.grid()
+plt.show()
+# Determine the optimal number of clusters
+optimal_clusters = cluster_range[np.argmax(silhouette_scores)]
+print(f"The optimal number of clusters is: {optimal_clusters}")
+# Load precomputed embeddings
+news_embeddings = np.load("news_embeddings.npy")
+# Normalize embeddings
+normalized_embeddings = normalize(news_embeddings)
+# Define the number of clusters
+num_clusters = 7
+# Perform Agglomerative Clustering without UMAP
+clustering_model_no_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
+cluster_labels_no_umap = clustering_model_no_umap.fit_predict(normalized_embeddings)
+silhouette_no_umap = silhouette_score(normalized_embeddings, cluster_labels_no_umap)
+print(f"Silhouette Score without UMAP: {silhouette_no_umap}")
+# Visualization of clustering results (using first two dimensions for a simple scatter plot)
+plt.figure(figsize=(10, 7))
+plt.scatter(normalized_embeddings[:, 0], normalized_embeddings[:, 1], c=cluster_labels_no_umap, cmap='viridis', alpha=0.6)
+plt.title(f'Agglomerative Clustering Visualization (Silhouette Score: {silhouette_no_umap:.4f})')
+plt.xlabel('Dimension 1')
+plt.ylabel('Dimension 2')
+plt.colorbar(label='Cluster Label')
+plt.show()
+# Perform UMAP and visualize
+import umap
+# Reduce dimensionality to 10 with UMAP (this step is optional for this part)
+reducer = umap.UMAP(n_components=10)
+reduced_embeddings = reducer.fit_transform(normalized_embeddings)
+# Perform clustering on reduced embeddings
+clustering_model_with_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
+cluster_labels_with_umap = clustering_model_with_umap.fit_predict(reduced_embeddings)
+silhouette_with_umap = silhouette_score(reduced_embeddings, cluster_labels_with_umap)
+print(f"Silhouette Score after UMAP: {silhouette_with_umap}")
+# Scatter plot of UMAP reduced embeddings
+plt.figure(figsize=(10, 7))
+plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=cluster_labels_with_umap, cmap='viridis', alpha=0.6)
+plt.title(f'Agglomerative Clustering Visualization with UMAP (Silhouette Score: {silhouette_with_umap:.4f})')
+plt.xlabel('UMAP Component 1')
+plt.ylabel('UMAP Component 2')
+plt.colorbar(label='Cluster Label')
+plt.show()
+# Compare scores
+if silhouette_with_umap > silhouette_no_umap:
+    print(f"UMAP improved the silhouette score by {silhouette_with_umap - silhouette_no_umap}")
+else:
+    print(f"UMAP did not improve the silhouette score. Difference: {silhouette_no_umap - silhouette_with_umap}")