Spaces:
Sleeping
Sleeping
File size: 1,441 Bytes
ca3e099 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 |
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
#from _config import config
class PCAHandler(BaseEstimator, TransformerMixin):
    """Optionally replace a DataFrame's numeric columns with PCA components.

    When ``apply_pca`` is true, ``fit`` learns a PCA on every numeric column
    of ``X`` except ``'groupid'``, and ``transform`` swaps those columns for
    ``PCA_1 .. PCA_k``. Every other column (``'groupid'`` and all non-numeric
    columns) is passed through unchanged. When ``apply_pca`` is false, both
    methods leave the data untouched.

    Parameters
    ----------
    apply_pca : bool, default=False
        Master switch; nothing is fitted or transformed unless it is true.
    variance : float, default=0.95
        Forwarded to ``PCA`` as ``n_components``. scikit-learn interprets a
        float strictly between 0 and 1 as the fraction of variance the kept
        components must explain.

    Attributes
    ----------
    pca : PCA or None
        The fitted decomposition, or ``None`` until ``fit`` has run with
        ``apply_pca`` enabled.
    num_cols : pandas.Index or None
        Labels of the columns fed to the PCA, or ``None`` until fitted.
    """

    def __init__(self, apply_pca=False, variance=0.95):
        self.apply_pca = apply_pca
        self.variance = variance
        self.pca = None
        self.num_cols = None  # Labels of the numeric columns given to PCA

    def fit(self, X, y=None):
        """Fit the PCA on the numeric columns of ``X`` (minus ``'groupid'``).

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; must support ``select_dtypes``.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        PCAHandler
            ``self``, so calls can be chained.
        """
        if self.apply_pca:
            numeric_cols = X.select_dtypes(include='number').columns
            # 'groupid' is deliberately kept out of the decomposition so it
            # survives transform() as-is. Index.difference also sorts the
            # labels when it can; transform() reuses this same stored order,
            # so fit and transform always see the features consistently.
            self.num_cols = numeric_cols.difference(['groupid'])
            self.pca = PCA(n_components=self.variance)
            self.pca.fit(X[self.num_cols])
        return self

    def transform(self, X):
        """Replace the fitted numeric columns of ``X`` with PCA components.

        Parameters
        ----------
        X : pandas.DataFrame
            Data containing every column recorded in ``num_cols`` at fit time.

        Returns
        -------
        pandas.DataFrame
            ``X`` itself when PCA is disabled or not fitted. Otherwise a new
            frame with the same rows and index as ``X``: the pass-through
            columns first, followed by ``PCA_1 .. PCA_k``.

        Raises
        ------
        ValueError
            If a pass-through column already uses one of the ``PCA_n`` names.
        """
        if not self.apply_pca or self.pca is None:
            return X

        components = self.pca.transform(X[self.num_cols])
        pca_names = [f'PCA_{i + 1}' for i in range(components.shape[1])]
        X_pca = pd.DataFrame(components, index=X.index, columns=pca_names)
        passthrough = X.drop(columns=self.num_cols)

        # Keep failing loudly on a name clash rather than silently emitting
        # duplicate column labels.
        clash = passthrough.columns.intersection(pca_names)
        if not clash.empty:
            raise ValueError(
                f'columns overlap but no suffix specified: {clash}'
            )

        # Combine positionally instead of DataFrame.join: join matches rows by
        # index label, so an index with repeated labels would multiply rows.
        # Both frames share the identical index, so concat needs no
        # realignment and the row count is always preserved.
        return pd.concat([passthrough, X_pca], axis=1)
|