import pandas as pd from sklearn.decomposition import PCA from sklearn.preprocessing import StandardScaler import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D import os import joblib script_path = os.path.abspath(__file__) script_dir = os.path.dirname(script_path) os.chdir(script_dir) file_path = 'TCGA-LGG.methylation450.tsv' df = pd.read_csv(file_path, sep='\t', index_col=0) df.dropna(inplace=True) scaler = StandardScaler() scaled_data = scaler.fit_transform(df.T) pca = PCA(n_components=50) principal_components = pca.fit_transform(scaled_data) pca_model_path = 'pca_model.pkl' joblib.dump(pca, pca_model_path) print(f"PCA模型已保存为 {pca_model_path}") loadings = pd.DataFrame(pca.components_.T, columns=[f'PC{i+1}' for i in range(pca.n_components_)], index=df.index) loadings.to_csv('pca_loadings.csv') print("主成分载荷矩阵已保存为 pca_loadings.csv") sample_ids = df.columns principal_df = pd.DataFrame(data=principal_components, columns=[f'Principal Component {i+1}' for i in range(50)], index=sample_ids) fig = plt.figure(figsize=(10, 8)) ax = fig.add_subplot(111, projection='3d') ax.scatter(principal_df['Principal Component 1'], principal_df['Principal Component 2'], principal_df['Principal Component 3']) for i, sample_id in enumerate(sample_ids): ax.text(principal_df['Principal Component 1'][i], principal_df['Principal Component 2'][i], principal_df['Principal Component 3'][i], sample_id) ax.set_xlabel('Principal Component 1') ax.set_ylabel('Principal Component 2') ax.set_zlabel('Principal Component 3') ax.set_title('3D PCA of Methylation Data') plt.show() output_file_path = 'pca_principal_components.csv' principal_df.to_csv(output_file_path) explained_variance = pca.explained_variance_ratio_ print(f"Explained variance by each component: {explained_variance}") print(f"50个主成分已保存为 {output_file_path}")