"""PCA projection helper and Plotly scatter plots of clustered data."""

import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px


def perform_pca(data, n_components):
    """Fit a PCA on *data* and return the projected coordinates.

    Args:
        data: Feature matrix passed straight to ``PCA.fit_transform``.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        A DataFrame with one column per returned component, named
        ``'principal component 1'``, ``'principal component 2'``, ...
        It is built from the bare result array, so it carries a default
        positional index rather than the index of *data*.
    """
    pca = PCA(n_components=n_components)
    components = pca.fit_transform(data)
    # Name the columns after what PCA actually returned instead of
    # range(n_components): identical for an integer n_components, and it
    # keeps working for the non-integer settings PCA accepts.
    columns = [
        f'principal component {i + 1}' for i in range(components.shape[1])
    ]
    return pd.DataFrame(data=components, columns=columns)


def _merge_plot_frame(clustered_data, principalDf, df):
    """Join *df*, the cluster labels and the PCA coordinates into one frame.

    Args:
        clustered_data: DataFrame with a ``'cluster'`` column whose index
            values match the index of *df*.
        principalDf: DataFrame of principal-component columns.
        df: DataFrame holding the columns to show alongside each point.

    Returns:
        A new DataFrame containing the columns of *df*, the ``'cluster'``
        column and the columns of *principalDf*.
    """
    clusters = clustered_data.reset_index()
    # reset_index() names the new column after the index when the index is
    # named; normalise it to 'index' so the selection below does not raise
    # KeyError. Only taken when 'index' is otherwise absent.
    if 'index' not in clusters.columns and clustered_data.index.name is not None:
        clusters = clusters.rename(
            columns={clustered_data.index.name: 'index'}
        )

    merged = df.merge(
        clusters[['index', 'cluster']], left_index=True, right_on='index'
    )
    # NOTE(review): this index-on-index merge only lines rows up if
    # principalDf's index matches the index left by the merge above --
    # presumably both are positional (0..n-1) in the same row order;
    # confirm against the caller.
    merged = merged.merge(principalDf, left_index=True, right_index=True)
    return merged.drop(columns=['index'])


def plot_pca(clustered_data, principalDf, df, information_columns):
    """Build a 2-D scatter plot of the first two principal components.

    Args:
        clustered_data: DataFrame with a ``'cluster'`` column, indexed like
            *df*.
        principalDf: DataFrame containing at least the columns
            ``'principal component 1'`` and ``'principal component 2'``.
        df: DataFrame providing the columns named in *information_columns*.
        information_columns: Passed to Plotly as ``hover_data``.

    Returns:
        The Plotly figure. It is returned, not shown.
    """
    final_df = _merge_plot_frame(clustered_data, principalDf, df)
    return px.scatter(
        final_df,
        x='principal component 1',
        y='principal component 2',
        color='cluster',
        hover_data=information_columns,
        title='2 Component PCA',
        labels={
            'principal component 1': 'Principal Component 1',
            'principal component 2': 'Principal Component 2',
        },
        color_continuous_scale='viridis',
    )


def plot_pca_3D(clustered_data, principalDf, df, information_columns):
    """Build a 3-D scatter plot of the first three principal components.

    Args:
        clustered_data: DataFrame with a ``'cluster'`` column, indexed like
            *df*.
        principalDf: DataFrame containing at least the columns
            ``'principal component 1'`` through ``'principal component 3'``.
        df: DataFrame providing the columns named in *information_columns*.
        information_columns: Passed to Plotly as ``hover_data``.

    Returns:
        The Plotly figure. It is returned, not shown.
    """
    final_df = _merge_plot_frame(clustered_data, principalDf, df)
    # TODO: optionally extend the hover data with each cluster's most common
    # feature values (previously sketched here as disabled code).
    hover_data = information_columns
    axis_labels = {
        f'principal component {i + 1}': f'Principal Component {i + 1}'
        for i in range(3)
    }
    return px.scatter_3d(
        final_df,
        x='principal component 1',
        y='principal component 2',
        z='principal component 3',
        color='cluster',
        hover_data=hover_data,
        title='3 Component PCA',
        labels=axis_labels,
        color_continuous_scale='viridis',
    )