# wu981526092's picture
# update
# 760a88c
import pandas as pd
from sklearn.decomposition import PCA
import plotly.express as px
def perform_pca(data, n_components):
    """Project ``data`` onto its principal components.

    Fits a scikit-learn ``PCA`` on ``data`` and returns the transformed
    coordinates as a DataFrame.

    Args:
        data: Input handed unchanged to ``PCA.fit_transform``.
        n_components: Forwarded unchanged to ``PCA(n_components=...)``.

    Returns:
        A ``pandas.DataFrame`` with one column per component produced by
        the transform, named ``'principal component 1'``,
        ``'principal component 2'``, and so on.
    """
    pca = PCA(n_components=n_components)
    components = pca.fit_transform(data)
    # Derive the column count from the transformed output instead of from
    # ``range(n_components)``: PCA also accepts non-integer settings
    # (None, a float variance fraction, 'mle' per the scikit-learn docs),
    # for which ``range`` would raise TypeError. For integer values the
    # resulting column names are unchanged.
    column_names = [
        f'principal component {i + 1}' for i in range(components.shape[1])
    ]
    return pd.DataFrame(data=components, columns=column_names)
def plot_pca(clustered_data, principalDf, df, information_columns):
    """Build a 2-D scatter figure of the first two principal components.

    Args:
        clustered_data: DataFrame with a ``'cluster'`` column; its index is
            matched against the index of ``df``.
        principalDf: DataFrame holding at least the columns
            ``'principal component 1'`` and ``'principal component 2'``;
            joined to the merged frame on index.
        df: DataFrame supplying the remaining columns (including the ones
            listed in ``information_columns``).
        information_columns: Passed to plotly as ``hover_data``.

    Returns:
        The plotly figure produced by ``px.scatter``; it is returned, not
        shown.
    """
    # Expose the cluster frame's index as an 'index' column to join on.
    cluster_lookup = clustered_data.reset_index()[['index', 'cluster']]
    merged = df.merge(cluster_lookup, left_index=True, right_on='index')
    merged = merged.merge(principalDf, left_index=True, right_index=True)
    # The join key has served its purpose and should not reach the plot.
    plot_frame = merged.drop(columns=['index'])

    axis_labels = {
        f'principal component {i + 1}': f'Principal Component {i + 1}'
        for i in range(2)
    }
    return px.scatter(
        plot_frame,
        x='principal component 1',
        y='principal component 2',
        color='cluster',
        hover_data=information_columns,
        title='2 Component PCA',
        labels=axis_labels,
        color_continuous_scale='viridis',
    )
# def get_common_features(data):
# common_features = {}
# for cluster in data['cluster'].unique():
# cluster_data = data[data['cluster'] == cluster]
# cluster_features_counts = {}
# for column in cluster_data.drop(columns=["cluster"]).columns:
# top_feature = cluster_data[column].mode()[0] # Use mode to find the most common category
# if top_feature != 0: # Add only if most common feature is not 0
# cluster_features_counts[column] = top_feature
# common_features[cluster] = cluster_features_counts
# return common_features
def plot_pca_3D(clustered_data, principalDf, df, information_columns):
    """Build a 3-D scatter figure of the first three principal components.

    Args:
        clustered_data: DataFrame with a ``'cluster'`` column; its index is
            matched against the index of ``df``.
        principalDf: DataFrame holding at least the columns
            ``'principal component 1'`` through ``'principal component 3'``;
            joined to the merged frame on index.
        df: DataFrame supplying the remaining columns (including the ones
            listed in ``information_columns``).
        information_columns: Passed to plotly as ``hover_data``.

    Returns:
        The plotly figure produced by ``px.scatter_3d``; it is returned,
        not shown.
    """
    # Expose the cluster frame's index as an 'index' column to join on.
    cluster_lookup = clustered_data.reset_index()[['index', 'cluster']]
    merged = df.merge(cluster_lookup, left_index=True, right_on='index')
    merged = merged.merge(principalDf, left_index=True, right_index=True)
    # The join key has served its purpose and should not reach the plot.
    plot_frame = merged.drop(columns=['index'])

    # NOTE: hover columns are exactly ``information_columns``; no
    # per-cluster "Most common <feature>" columns are added here.
    axis_labels = {
        'principal component 1': 'Principal Component 1',
        'principal component 2': 'Principal Component 2',
        'principal component 3': 'Principal Component 3',
    }
    return px.scatter_3d(
        plot_frame,
        x='principal component 1',
        y='principal component 2',
        z='principal component 3',
        color='cluster',
        hover_data=information_columns,
        title='3 Component PCA',
        labels=axis_labels,
        color_continuous_scale='viridis',
    )