from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score


def _build_kmeans(n_clusters):
    """Return an unfitted KMeans using the settings shared by this module.

    Keeping the configuration in one place guarantees that the model scans
    (elbow / silhouette) and the final fit are run with identical
    hyperparameters, so the chosen cluster count is evaluated consistently.
    """
    return KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,
    )


def calculate_wcss(data, max_clusters=10):
    """Compute the within-cluster sum of squares for k = 1..max_clusters.

    Args:
        data: Feature matrix accepted by ``KMeans.fit``.
        max_clusters: Largest cluster count to evaluate (inclusive).

    Returns:
        A list whose i-th entry is ``inertia_`` of the model fitted with
        ``i + 1`` clusters.
    """
    return [
        _build_kmeans(k).fit(data).inertia_
        for k in range(1, max_clusters + 1)
    ]


def calculate_silhouette_scores(data, max_clusters=10):
    """Compute the Euclidean silhouette score for k = 2..max_clusters.

    The scan starts at 2 because this function never evaluates a
    single-cluster model.

    Args:
        data: Feature matrix accepted by ``KMeans.fit``.
        max_clusters: Largest cluster count to evaluate (inclusive).

    Returns:
        A list whose i-th entry is the silhouette score of the model fitted
        with ``i + 2`` clusters.
    """
    scores = []
    for k in range(2, max_clusters + 1):
        labels = _build_kmeans(k).fit(data).labels_
        scores.append(silhouette_score(data, labels, metric='euclidean'))
    return scores


def plot_elbow(wcss):
    """Plot WCSS against the number of clusters and show the figure.

    Args:
        wcss: Sequence of WCSS values where the i-th entry corresponds to
            ``i + 1`` clusters, as returned by ``calculate_wcss``.
    """
    # Derive the x-axis from the data so that scans of any length plot
    # correctly instead of requiring exactly ten values.
    cluster_counts = range(1, len(wcss) + 1)
    plt.plot(cluster_counts, wcss)
    plt.title('Elbow Method')
    plt.xlabel('Number of clusters')
    plt.ylabel('WCSS')
    plt.show()


def get_optimal_clusters_silhouette(scores, min_clusters=2):
    """Return the cluster count with the highest silhouette score.

    Args:
        scores: Silhouette scores where the i-th entry corresponds to
            ``min_clusters + i`` clusters.
        min_clusters: Cluster count of the first entry in ``scores``. The
            default matches ``calculate_silhouette_scores``, which starts
            its scan at 2.

    Returns:
        The cluster count of the first maximal score. The value is also
        printed to stdout.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    scores = list(scores)  # also accept tuples / arrays, which lack .index
    if not scores:
        raise ValueError("scores must contain at least one silhouette score")
    optimal_clusters = scores.index(max(scores)) + min_clusters
    print(f"Optimal number of clusters: {optimal_clusters}")
    return optimal_clusters


def fit_kmeans(data, n_clusters):
    """Fit KMeans and store the labels in ``data['cluster']``.

    ``data`` is modified in place: its ``'cluster'`` entry is created or
    overwritten with the predicted labels, and the same object is returned.

    Args:
        data: Feature table that supports ``data['cluster'] = ...``
            (presumably a pandas DataFrame -- confirm against callers).
        n_clusters: Number of clusters to fit.

    Returns:
        A ``(kmeans, data)`` tuple with the fitted model and the input
        object carrying the new ``'cluster'`` labels.
    """
    # A 'cluster' column left over from a previous call holds labels, not a
    # feature; it is about to be overwritten, so keep it out of the fit.
    if 'cluster' in getattr(data, 'columns', ()):
        features = data.drop(columns=['cluster'])
    else:
        features = data

    # Same configuration as the scans, so the final model matches the one
    # that was evaluated when the cluster count was chosen.
    kmeans = _build_kmeans(n_clusters)
    data['cluster'] = kmeans.fit_predict(features)
    return kmeans, data