# Spaces: Sleeping
# utils/preprocessor.py
import logging

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
# Set up logging.
# NOTE: basicConfig is a no-op when the root logger already has handlers
# configured (e.g. by the hosting application).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DataPreprocessor:
    """Fill missing values, label-encode categoricals and scale numerics."""

    def __init__(self):
        # Scaler applied to the numeric columns; refit on every
        # ``preprocess`` call.
        self.scaler = StandardScaler()
        # Column name -> LabelEncoder fitted on that column. An entry is
        # overwritten when the same column is encoded again; entries are
        # never removed.
        self.label_encoders = {}

    def preprocess(self, df):
        """Return a preprocessed copy of ``df``; the input is not modified.

        Steps, in order:
          1. Missing values: numeric columns are filled with the column
             mean, all other columns with the column mode.
          2. Columns of dtype ``object`` or ``category`` are replaced by
             integer codes from a freshly fitted ``LabelEncoder`` (stored
             in ``self.label_encoders``).
          3. Columns that were numeric before step 2 are standardized with
             ``self.scaler`` (refit on this call).

        Args:
            df: Input pandas DataFrame.

        Returns:
            A new DataFrame with the same shape and column names.

        Raises:
            Exception: Any error raised while processing is logged with
                its traceback and re-raised unchanged.
        """
        try:
            logger.info(
                "Input DataFrame shape: %s, columns: %s",
                df.shape,
                list(df.columns),
            )
            df_processed = df.copy()

            self._fill_missing(df_processed)

            # Pick the columns to scale from the dtypes *before* encoding.
            # Encoded columns become integer-typed, and filtering them out
            # afterwards via ``self.label_encoders`` would also drop any
            # numeric column that merely shares a name with a column
            # encoded during an earlier call.
            cols_to_scale = list(
                df_processed.select_dtypes(include=[np.number]).columns
            )

            self._encode_categoricals(df_processed)
            self._scale_numeric(df_processed, cols_to_scale)

            logger.info("Preprocessed DataFrame shape: %s", df_processed.shape)
            return df_processed
        except Exception as e:
            logger.exception("Error preprocessing data: %s", e)
            raise

    def _fill_missing(self, df_processed):
        """Impute missing values in place (mean for numeric, mode otherwise)."""
        logger.info("Handling missing values...")
        for col in df_processed.columns:
            column = df_processed[col]
            if not column.isnull().any():
                continue

            if pd.api.types.is_numeric_dtype(column):
                df_processed[col] = column.fillna(column.mean())
                logger.info(
                    "Filled numeric missing values in '%s' with mean.", col
                )
                continue

            modes = column.mode()
            if modes.empty:
                # Every value is missing, so there is no mode to impute
                # from; indexing ``mode()[0]`` would raise here.
                logger.warning(
                    "Column '%s' has no non-missing values; left unfilled.",
                    col,
                )
                continue

            df_processed[col] = column.fillna(modes.iloc[0])
            logger.info(
                "Filled categorical missing values in '%s' with mode.", col
            )

    def _encode_categoricals(self, df_processed):
        """Label-encode object/category columns in place."""
        logger.info("Encoding categorical variables...")
        categorical_cols = df_processed.select_dtypes(
            include=['object', 'category']
        ).columns
        for col in categorical_cols:
            logger.info("Encoding column: %s", col)
            encoder = LabelEncoder()
            df_processed[col] = encoder.fit_transform(df_processed[col])
            self.label_encoders[col] = encoder

    def _scale_numeric(self, df_processed, cols_to_scale):
        """Standardize ``cols_to_scale`` in place with ``self.scaler``."""
        logger.info("Scaling numerical variables...")
        if not cols_to_scale:
            return
        df_processed[cols_to_scale] = self.scaler.fit_transform(
            df_processed[cols_to_scale]
        )
        logger.info("Scaled numeric columns: %s", cols_to_scale)
def summarize_dataframe_for_chatbot(data_list):
    """Generate a text summary of tabular data for chatbot interaction.

    Args:
        data_list: Records accepted by ``pd.DataFrame(...)`` (for example
            a list of dicts), or an existing DataFrame. ``None`` and empty
            containers are treated as "no data".

    Returns:
        ``"No data loaded."`` for empty input. Otherwise a multi-line
        string with the row/column counts followed by one line per column
        giving its dtype, min/max (numeric columns) or number of distinct
        values (other columns), and the missing-value count when non-zero.
    """
    if isinstance(data_list, pd.DataFrame):
        # Truth-testing a DataFrame raises ValueError, so emptiness has to
        # be checked explicitly.
        if data_list.empty:
            return "No data loaded."
        df = data_list
    elif not data_list:
        return "No data loaded."
    else:
        df = pd.DataFrame(data_list)

    num_rows, num_cols = df.shape
    col_info = []
    for col in df.columns:
        column = df[col]
        info = f"-{col} (Type:{column.dtype}"
        if pd.api.types.is_numeric_dtype(column):
            info += f", Min:{column.min():.2f}, Max:{column.max():.2f}"
        else:
            try:
                unique_vals = column.nunique()
            except TypeError:
                # Cells holding unhashable values (lists, dicts) cannot be
                # hashed by nunique(); compare their string form instead.
                unique_vals = column.dropna().astype(str).nunique()
            info += f", Unique:{unique_vals}"
        missing_count = column.isnull().sum()
        if missing_count > 0:
            info += f", Missing:{missing_count}"
        info += ")"
        col_info.append(info)

    return (
        f"Dataset Summary:\n- Rows: {num_rows}, Columns: {num_cols}\nColumns:\n"
        + "\n".join(col_info)
    )