# utils/preprocessor.py
import logging

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DataPreprocessor:
    """Fill missing values, label-encode categoricals and scale numeric columns."""

    def __init__(self):
        # Scaler that is refit (fit_transform) on every preprocess() call
        # that has at least one column to scale.
        self.scaler = StandardScaler()
        # Column name -> LabelEncoder fitted the last time that column was
        # encoded. Entries persist across preprocess() calls.
        self.label_encoders = {}

    def preprocess(self, df):
        """Return a preprocessed copy of ``df``; the input is not modified.

        Steps, in order:

        1. Missing values: numeric columns are filled with the column mean,
           all other columns with the first mode. A non-numeric column with
           no non-missing values has no mode and is left unfilled (a warning
           is logged).
        2. Columns of dtype ``object``/``category`` are replaced by integer
           codes from a freshly fitted ``LabelEncoder``, which is stored in
           ``self.label_encoders`` under the column name.
        3. The remaining numeric columns are scaled with ``self.scaler``.
           Columns encoded in step 2 of *this* call are not scaled.

        Args:
            df: The pandas DataFrame to preprocess.

        Returns:
            A new DataFrame with the same shape and column labels as ``df``.

        Raises:
            Exception: Any error raised while preprocessing is logged with
                its traceback and re-raised unchanged.
        """
        try:
            logger.info(
                "Input DataFrame shape: %s, columns: %s",
                df.shape,
                list(df.columns),
            )
            df_processed = df.copy()

            logger.info("Handling missing values...")
            self._fill_missing(df_processed)

            logger.info("Encoding categorical variables...")
            encoded_cols = self._encode_categoricals(df_processed)

            logger.info("Scaling numerical variables...")
            self._scale_numeric(df_processed, encoded_cols)

            logger.info("Preprocessed DataFrame shape: %s", df_processed.shape)
            return df_processed
        except Exception as e:
            logger.exception("Error preprocessing data: %s", e)
            raise

    def _fill_missing(self, frame):
        """Fill missing values in ``frame`` in place (mean or first mode)."""
        for col in frame.columns:
            series = frame[col]
            if not series.isnull().any():
                continue

            if pd.api.types.is_numeric_dtype(series):
                frame[col] = series.fillna(series.mean())
                logger.info("Filled numeric missing values in '%s' with mean.", col)
                continue

            modes = series.mode()
            if modes.empty:
                # Every value is missing, so there is nothing to impute with.
                # Indexing the empty mode would raise, so skip the column.
                logger.warning(
                    "Column '%s' has no non-missing values; leaving it unfilled.",
                    col,
                )
                continue

            frame[col] = series.fillna(modes.iloc[0])
            logger.info("Filled categorical missing values in '%s' with mode.", col)

    def _encode_categoricals(self, frame):
        """Label-encode object/category columns in place; return their names."""
        encoded_cols = []
        categorical_cols = frame.select_dtypes(include=['object', 'category']).columns
        for col in categorical_cols:
            logger.info("Encoding column: %s", col)
            encoder = LabelEncoder()
            frame[col] = encoder.fit_transform(frame[col])
            # Register the encoder only once it has been fitted successfully.
            self.label_encoders[col] = encoder
            encoded_cols.append(col)
        return encoded_cols

    def _scale_numeric(self, frame, skip_cols):
        """Scale numeric columns of ``frame`` in place, except ``skip_cols``."""
        # Label-encoded columns are numeric by now but are category codes, so
        # they must not be scaled. Only columns encoded during the current
        # call are skipped: self.label_encoders may still hold entries from
        # earlier calls, and using it here would wrongly exclude a column
        # that is genuinely numeric in this DataFrame.
        skip = set(skip_cols)
        numeric_cols = frame.select_dtypes(include=[np.number]).columns
        cols_to_scale = [col for col in numeric_cols if col not in skip]
        if cols_to_scale:
            frame[cols_to_scale] = self.scaler.fit_transform(frame[cols_to_scale])
            logger.info("Scaled numeric columns: %s", cols_to_scale)


def summarize_dataframe_for_chatbot(data_list):
    """Generate a text summary of tabular data for chatbot interaction.

    Args:
        data_list: Data accepted by ``pd.DataFrame(...)``; a falsy value
            (for example an empty list or ``None``) means no data.

    Returns:
        ``"No data loaded."`` when ``data_list`` is falsy. Otherwise a
        multi-line string giving the row and column counts followed by one
        line per column: its dtype, then Min/Max (two decimals) for numeric
        columns or the unique-value count for other columns, plus the
        missing-value count when it is greater than zero.
    """
    if not data_list:
        return "No data loaded."

    df = pd.DataFrame(data_list)
    num_rows, num_cols = df.shape

    col_info = []
    for col in df.columns:
        series = df[col]
        details = [f"Type:{series.dtype}"]
        if pd.api.types.is_numeric_dtype(series):
            details.append(f"Min:{series.min():.2f}, Max:{series.max():.2f}")
        else:
            details.append(f"Unique:{series.nunique()}")

        missing_count = series.isnull().sum()
        if missing_count > 0:
            details.append(f"Missing:{missing_count}")

        col_info.append(f"-{col} (" + ", ".join(details) + ")")

    return (
        f"Dataset Summary:\n- Rows: {num_rows}, Columns: {num_cols}\nColumns:\n"
        + "\n".join(col_info)
    )