Spaces:
Sleeping
Sleeping
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import umap
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize
# Load the synonyms table from Synonyms.csv (read relative to the working directory).
synonyms = pd.read_csv('Synonyms.csv')
# A bare `synonyms.head()` expression only renders inside a notebook/REPL;
# print it so the preview is also visible when this file runs as a script.
print(synonyms.head())
# ---------------------------------------------------------------------------
# Silhouette sweep: choose a cluster count for Ward-linkage agglomerative
# clustering on UMAP-reduced embeddings.
# ---------------------------------------------------------------------------

# Read the precomputed embedding matrix and normalize it (sklearn `normalize`
# with its default settings).
news_embeddings = np.load("news_embeddings.npy")
normalized_embeddings = normalize(news_embeddings)

# Reduce to 10 UMAP components; the fixed seed keeps the projection repeatable.
reducer = umap.UMAP(n_components=10, random_state=42)
umap_embeddings = reducer.fit_transform(normalized_embeddings)

# Candidate cluster counts (2 through 19) and the score collected for each one.
cluster_range = range(2, 20)  # You can adjust the range based on your needs
silhouette_scores = []

for num_clusters in cluster_range:
    # Fit Ward-linkage agglomerative clustering for this candidate count.
    clustering_model = AgglomerativeClustering(
        n_clusters=num_clusters,
        linkage='ward',
    ).fit(umap_embeddings)
    cluster_labels = clustering_model.labels_

    # Score the partition in the same space it was clustered in.
    silhouette_avg = silhouette_score(umap_embeddings, cluster_labels)
    silhouette_scores.append(silhouette_avg)
    print(f"Number of Clusters: {num_clusters}, Silhouette Score: {silhouette_avg:.4f}")

# Line chart of silhouette score against cluster count.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cluster_range, silhouette_scores, marker='o')
ax.set_title('Silhouette Scores for Different Numbers of Clusters (With UMAP)')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.set_xticks(cluster_range)  # one tick per candidate cluster count
ax.grid()
plt.show()

# The best candidate is the one with the highest silhouette score
# (np.argmax returns the first maximum on ties).
best_index = int(np.argmax(silhouette_scores))
optimal_clusters = cluster_range[best_index]
print(f"The optimal number of clusters is: {optimal_clusters}")
# ---------------------------------------------------------------------------
# Baseline: Ward-linkage agglomerative clustering directly on the normalized
# embeddings (no dimensionality reduction).
# ---------------------------------------------------------------------------

# Reload and normalize the embeddings so this section runs on its own.
news_embeddings = np.load("news_embeddings.npy")
normalized_embeddings = normalize(news_embeddings)

# Fixed cluster count used for both the baseline and the UMAP comparison.
num_clusters = 7

# Cluster in the full embedding space and score the result in that same space.
clustering_model_no_umap = AgglomerativeClustering(
    n_clusters=num_clusters,
    linkage='ward',
).fit(normalized_embeddings)
cluster_labels_no_umap = clustering_model_no_umap.labels_
silhouette_no_umap = silhouette_score(normalized_embeddings, cluster_labels_no_umap)
print(f"Silhouette Score without UMAP: {silhouette_no_umap}")

# Quick-look scatter: only the first two embedding dimensions are plotted,
# coloured by the assigned cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
scatter_points = ax.scatter(
    normalized_embeddings[:, 0],
    normalized_embeddings[:, 1],
    c=cluster_labels_no_umap,
    cmap='viridis',
    alpha=0.6,
)
ax.set_title(f'Agglomerative Clustering Visualization (Silhouette Score: {silhouette_no_umap:.4f})')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
fig.colorbar(scatter_points, ax=ax, label='Cluster Label')
plt.show()
# ---------------------------------------------------------------------------
# Ward-linkage agglomerative clustering on UMAP-reduced embeddings, using the
# same `num_clusters` as the no-UMAP baseline.
# ---------------------------------------------------------------------------

# Reduce dimensionality to 10 with UMAP. The seed matches the UMAP
# configuration used for the cluster-count sweep earlier in this file; without
# it the projection (and therefore the silhouette score and the final
# comparison against the baseline) changes from run to run.
reducer = umap.UMAP(n_components=10, random_state=42)
reduced_embeddings = reducer.fit_transform(normalized_embeddings)

# Cluster in the reduced space and score the partition in that same space.
clustering_model_with_umap = AgglomerativeClustering(n_clusters=num_clusters, linkage='ward')
cluster_labels_with_umap = clustering_model_with_umap.fit_predict(reduced_embeddings)
silhouette_with_umap = silhouette_score(reduced_embeddings, cluster_labels_with_umap)
print(f"Silhouette Score after UMAP: {silhouette_with_umap}")

# Scatter plot of the first two UMAP components, coloured by cluster label.
plt.figure(figsize=(10, 7))
plt.scatter(
    reduced_embeddings[:, 0],
    reduced_embeddings[:, 1],
    c=cluster_labels_with_umap,
    cmap='viridis',
    alpha=0.6,
)
plt.title(f'Agglomerative Clustering Visualization with UMAP (Silhouette Score: {silhouette_with_umap:.4f})')
plt.xlabel('UMAP Component 1')
plt.ylabel('UMAP Component 2')
plt.colorbar(label='Cluster Label')
plt.show()
# Report whether clustering on the UMAP-reduced embeddings scored strictly
# higher than clustering on the full normalized embeddings. A tie (or a
# non-comparable score) is reported as "did not improve".
umap_improved_score = silhouette_with_umap > silhouette_no_umap
comparison_message = (
    f"UMAP improved the silhouette score by {silhouette_with_umap - silhouette_no_umap}"
    if umap_improved_score
    else f"UMAP did not improve the silhouette score. Difference: {silhouette_no_umap - silhouette_with_umap}"
)
print(comparison_message)