# Wilson-ZheLin
# Initial commit
# 9183c57
# raw
# history blame contribute delete
# No virus
# 5.1 kB
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from src.preprocess import convert_to_integer
def decide_pca(df, cumulative_variance_threshold=0.95, min_dim_reduction_ratio=0.1):
    """
    Determines whether PCA should be performed based on cumulative variance threshold and dimension reduction ratio.

    Only the numeric columns of `df` are considered. They are standardized, a full PCA is
    fitted, and the smallest number of leading components whose cumulative explained
    variance ratio reaches the threshold is selected.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - cumulative_variance_threshold (float): The threshold of explained variance to retain. Default is 0.95.
    - min_dim_reduction_ratio (float): The minimum ratio of dimension reduction required to perform PCA. Default is 0.1.

    Returns:
    - perform_pca (bool): Whether PCA should be performed.
    - n_components (int): The number of principal components to retain.

    Errors raised by scikit-learn while scaling or fitting propagate unchanged.
    """
    # Remove non-numeric columns
    numeric_df = df.select_dtypes(include=[np.number])
    n_features = numeric_df.shape[1]

    # Standardizing the Data
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # PCA for Explained Variance
    pca = PCA()
    pca.fit(scaled_data)

    # Calculate cumulative variance
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Find the number of components for the desired threshold. The threshold may
    # never be reached (floating-point round-off with a threshold of 1.0, or NaN
    # ratios for data without variance); in that case keep every component
    # instead of failing with an IndexError.
    reached = np.flatnonzero(cumulative_variance >= cumulative_variance_threshold)
    if reached.size:
        n_components = int(reached[0]) + 1
    else:
        n_components = len(cumulative_variance)

    # Calculate the dimension reduction ratio relative to the features PCA was
    # actually fitted on; non-numeric columns were excluded above and must not
    # inflate the ratio.
    dim_reduction_ratio = 1 - (n_components / n_features)

    # Check if PCA should be performed based on the dimension reduction ratio
    should_perform = bool(dim_reduction_ratio >= min_dim_reduction_ratio)
    return should_perform, n_components
def perform_pca(df, n_components, Y_name=None):
    """
    Performs PCA on the dataset, optionally excluding a target column, and standardizes the data.

    Non-numeric columns are dropped, the remaining features are standardized with
    StandardScaler, and PCA is fitted on the scaled values. The result has a fresh
    0..n-1 index; the original index of `df` is not kept.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain. Any other value
      accepted by sklearn's PCA is passed through as well.
    - Y_name (str, optional): The name of the target column to exclude from PCA. Default is None.

    Returns:
    - pca_df (DataFrame): DataFrame with principal components (columns 'PC1', 'PC2', ...)
      and optionally the target column.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column
    # (errors='ignore' because a non-numeric target is already gone after select_dtypes)
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Standardizing the Data
    scaled_data = StandardScaler().fit_transform(numeric_df)

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(scaled_data)

    # Create a new DataFrame with principal components. Label by the number of
    # columns PCA actually produced so that non-integer n_components values
    # (float fraction, None) do not break the naming.
    columns = [f'PC{i + 1}' for i in range(principal_components.shape[1])]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column; its index is reset so rows align positionally
    # with the freshly indexed component frame.
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        # NOTE(review): convert_to_integer comes from src.preprocess; presumably it
        # casts the target column to an integer dtype - confirm against that helper.
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
    return pca_df
def perform_PCA_for_clustering(df, n_components):
    """
    Applies PCA transformation for clustering tasks on the given DataFrame.

    The frame is passed to PCA as-is: no columns are dropped and no scaling is
    applied inside this function. The result has a fresh 0..n-1 index.

    Parameters:
    - df (DataFrame): The input DataFrame to apply PCA.
    - n_components (int): The number of principal components to retain. Any other value
      accepted by sklearn's PCA is passed through as well.

    Returns:
    - pca_df (DataFrame): DataFrame of the principal components (columns 'PC1', 'PC2', ...).
    """
    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(df)

    # Create a new DataFrame with principal components. Label by the number of
    # columns PCA actually produced so that non-integer n_components values
    # (float fraction, None) do not break the naming.
    columns = [f'PC{i + 1}' for i in range(principal_components.shape[1])]
    return pd.DataFrame(data=principal_components, columns=columns)
def perform_PCA_for_regression(df, n_components, Y_name=None):
    """
    Applies PCA for regression tasks, excluding a specified target column from the transformation.

    Non-numeric columns are dropped and PCA is fitted on the remaining features
    without any scaling inside this function. The result has a fresh 0..n-1 index;
    the original index of `df` is not kept.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - n_components (int): The number of principal components to retain. Any other value
      accepted by sklearn's PCA is passed through as well.
    - Y_name (str, optional): The name of the target column to exclude from PCA and append back after transformation. Default is None.

    Returns:
    - pca_df (DataFrame): A new DataFrame with principal components (columns 'PC1', 'PC2', ...)
      and, when Y_name is given, the target column.
    """
    # Save the target column data
    drop_columns = []
    target_data = None
    if Y_name:
        target_data = df[Y_name]
        drop_columns.append(Y_name)

    # Remove non-numeric columns and the target column
    # (errors='ignore' because a non-numeric target is already gone after select_dtypes)
    numeric_df = df.select_dtypes(include=[np.number]).drop(columns=drop_columns, errors='ignore')

    # Applying PCA
    pca = PCA(n_components=n_components)
    principal_components = pca.fit_transform(numeric_df)

    # Create a new DataFrame with principal components. Label by the number of
    # columns PCA actually produced so that non-integer n_components values
    # (float fraction, None) do not break the naming.
    columns = [f'PC{i + 1}' for i in range(principal_components.shape[1])]
    pca_df = pd.DataFrame(data=principal_components, columns=columns)

    # Reattach the target column; its index is reset so rows align positionally
    # with the freshly indexed component frame.
    if Y_name:
        pca_df[Y_name] = target_data.reset_index(drop=True)
        # NOTE(review): convert_to_integer comes from src.preprocess; presumably it
        # casts the target column to an integer dtype - confirm against that helper.
        pca_df, _ = convert_to_integer(pca_df, columns_to_convert=[Y_name])
    return pca_df