# csycsycsy's picture
# Upload 10 files
# 84db192 verified
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import os
import joblib
# Run from the script's own directory so the relative input/output paths
# used throughout this script resolve next to the script file.
script_path = os.path.abspath(__file__)
script_dir = os.path.dirname(script_path)
os.chdir(script_dir)

# Load the tab-separated table; the first column becomes the row index.
# NOTE(review): presumably rows are CpG probes and columns are samples
# (the table is transposed before fitting) -- confirm against the data file.
file_path = 'TCGA-LGG.methylation450.tsv'
df = pd.read_csv(file_path, sep='\t', index_col=0)

# Drop every row that contains at least one missing value.
df.dropna(inplace=True)

# Transpose so that each column of the input table becomes one observation,
# then standardise every feature to zero mean and unit variance.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.T)

# Fit a 50-component PCA and project the observations onto it.
# random_state is fixed because scikit-learn may select the randomized SVD
# solver for large inputs; without a seed the saved model, loadings and
# scores would not be reproducible from one run to the next.
pca = PCA(n_components=50, random_state=0)
principal_components = pca.fit_transform(scaled_data)
# --- Persist the fitted PCA estimator ---------------------------------------
pca_model_path = 'pca_model.pkl'
joblib.dump(pca, pca_model_path)
print(f"PCA模型已保存为 {pca_model_path}")

# --- Export the loadings -----------------------------------------------------
# pca.components_ is (components x features); transposing it gives one row per
# input feature (labelled with df.index) and one column per component.
component_labels = [f'PC{i+1}' for i in range(pca.n_components_)]
loadings = pd.DataFrame(
    pca.components_.T,
    index=df.index,
    columns=component_labels,
)
loadings.to_csv('pca_loadings.csv')
print("主成分载荷矩阵已保存为 pca_loadings.csv")
# Wrap the PCA scores in a DataFrame: one row per column of the input table
# (used here as the sample identifier) and one column per component.
sample_ids = df.columns
principal_df = pd.DataFrame(
    data=principal_components,
    # Derive the labels from the actual width of the score matrix instead of
    # a hard-coded count, so the frame always matches the fitted PCA.
    columns=[f'Principal Component {i+1}' for i in range(principal_components.shape[1])],
    index=sample_ids,
)

# 3D scatter of the first three components.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
pc1 = principal_df['Principal Component 1']
pc2 = principal_df['Principal Component 2']
pc3 = principal_df['Principal Component 3']
ax.scatter(pc1, pc2, pc3)

# Label every point with its sample identifier. The coordinates are walked
# in order alongside the ids rather than looked up with an integer key:
# these Series are indexed by sample id, and integer-position access through
# [] on a label-indexed Series is deprecated in pandas.
for sample_id, x, y, z in zip(sample_ids, pc1, pc2, pc3):
    ax.text(x, y, z, sample_id)

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
ax.set_title('3D PCA of Methylation Data')
plt.show()
# Write the per-sample component scores to CSV (row index = sample id).
output_file_path = 'pca_principal_components.csv'
principal_df.to_csv(output_file_path)

# Report the fraction of total variance captured by each component.
explained_variance = pca.explained_variance_ratio_
print(f"Explained variance by each component: {explained_variance}")

# Report the number of components actually written rather than a hard-coded
# figure, so the message cannot drift from the PCA configuration.
print(f"{principal_df.shape[1]}个主成分已保存为 {output_file_path}")