# CausalBox / utils / preprocessor.py
# ShutterStack's picture
# major changes
# ab66d4e verified
# utils/preprocessor.py
from sklearn.preprocessing import StandardScaler, LabelEncoder
import pandas as pd
import numpy as np
import logging
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Set up logging.
# NOTE: basicConfig() runs at import time, so importing this module configures
# the root logger at INFO level (the call does nothing if the root logger
# already has handlers configured).
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used by the code below.
logger = logging.getLogger(__name__)
class DataPreprocessor:
    """Impute, label-encode and standard-scale a pandas DataFrame.

    Attributes:
        scaler: The ``StandardScaler`` that is re-fitted on every call to
            :meth:`preprocess`.
        label_encoders: Mapping of column name to the ``LabelEncoder`` most
            recently fitted for that column. Entries persist across calls.
    """

    def __init__(self):
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def preprocess(self, df):
        """
        Preprocess DataFrame: handle missing values, encode categorical variables, scale numerical variables.

        The input frame is copied and never modified. Processing steps:

        1. Missing values in numeric columns are replaced by the column mean;
           in other columns by the most frequent value (the first mode).
        2. Columns of dtype ``object`` / ``category`` are label-encoded; the
           fitted encoder is stored in ``self.label_encoders``.
        3. The remaining numeric columns are standardised with ``self.scaler``.

        Args:
            df: The DataFrame to preprocess.

        Returns:
            A new, preprocessed DataFrame with the same shape as ``df``.

        Raises:
            Exception: Any error raised during preprocessing is logged with
                its traceback and re-raised unchanged.
        """
        try:
            logger.info(f"Input DataFrame shape: {df.shape}, columns: {list(df.columns)}")
            df_processed = df.copy()

            # Handle missing values
            logger.info("Handling missing values...")
            for col in df_processed.columns:
                column = df_processed[col]
                if not column.isnull().any():
                    continue
                if pd.api.types.is_numeric_dtype(column):
                    df_processed[col] = column.fillna(column.mean())
                    logger.info(f"Filled numeric missing values in '{col}' with mean.")
                    continue
                modes = column.mode()
                if modes.empty:
                    # A column with no non-missing values has no mode, so
                    # indexing mode()[0] would raise. There is nothing
                    # sensible to fill with; leave the column as it is.
                    logger.warning(
                        f"Column '{col}' has no non-missing values; leaving missing values unfilled."
                    )
                    continue
                df_processed[col] = column.fillna(modes.iloc[0])
                logger.info(f"Filled categorical missing values in '{col}' with mode.")

            # Encode categorical variables
            logger.info("Encoding categorical variables...")
            # Track the columns encoded in THIS call. self.label_encoders
            # persists across calls, so using it for the scaling exclusion
            # below would wrongly skip a numeric column that merely shares
            # its name with a categorical column from an earlier dataset.
            encoded_cols = set()
            for col in df_processed.select_dtypes(include=['object', 'category']).columns:
                logger.info(f"Encoding column: {col}")
                encoder = LabelEncoder()
                df_processed[col] = encoder.fit_transform(df_processed[col])
                self.label_encoders[col] = encoder
                encoded_cols.add(col)

            # Scale numerical variables
            logger.info("Scaling numerical variables...")
            numeric_cols = df_processed.select_dtypes(include=[np.number]).columns
            # Label-encoded columns are integer-typed now but still represent
            # categories, so they are excluded from scaling.
            cols_to_scale = [col for col in numeric_cols if col not in encoded_cols]
            if cols_to_scale:
                df_processed[cols_to_scale] = self.scaler.fit_transform(df_processed[cols_to_scale])
                logger.info(f"Scaled numeric columns: {cols_to_scale}")

            logger.info(f"Preprocessed DataFrame shape: {df_processed.shape}")
            return df_processed
        except Exception as e:
            logger.exception(f"Error preprocessing data: {str(e)}")
            raise
def summarize_dataframe_for_chatbot(data_list):
    """
    Generate a text summary of the DataFrame for chatbot interaction.

    Args:
        data_list: Records accepted by ``pd.DataFrame(...)`` (for example a
            list of dicts). A falsy value such as ``None`` or ``[]`` means
            that no data is loaded.

    Returns:
        A multi-line string with the row and column counts followed by one
        line per column: its dtype, min/max for numeric columns or the number
        of distinct values otherwise, and the missing-value count when it is
        non-zero. Returns ``"No data loaded."`` when ``data_list`` is falsy.
    """
    if not data_list:
        return "No data loaded."

    df = pd.DataFrame(data_list)
    num_rows, num_cols = df.shape

    col_info = []
    for col in df.columns:
        series = df[col]
        info = f"-{col} (Type:{series.dtype}"
        if pd.api.types.is_numeric_dtype(series):
            info += f", Min:{series.min():.2f}, Max:{series.max():.2f}"
        else:
            try:
                unique_vals = series.nunique()
            except TypeError:
                # Cells holding unhashable values (lists, dicts) cannot be
                # hashed by nunique(); count distinct string representations
                # instead, still ignoring missing values.
                unique_vals = series.dropna().astype(str).nunique()
            info += f", Unique:{unique_vals}"
        missing_count = series.isnull().sum()
        if missing_count > 0:
            info += f", Missing:{missing_count}"
        info += ")"
        col_info.append(info)

    summary = (
        f"Dataset Summary:\n- Rows: {num_rows}, Columns: {num_cols}\nColumns:\n"
        + "\n".join(col_info)
    )
    return summary