Spaces:
Sleeping
Sleeping
import pandas as pd | |
from sklearn.preprocessing import StandardScaler, LabelEncoder | |
def _fill_missing_with_mode(df, columns):
    """Fill missing values in *columns* of *df* in place with each column's mode.

    Columns that contain no missing values are left untouched. Columns whose
    values are all missing have no mode, so they are skipped rather than
    raising on the empty ``mode()`` result.
    """
    for col in columns:
        if not df[col].isna().any():
            continue
        modes = df[col].mode()
        if modes.empty:
            # Entirely-missing column: nothing sensible to fill with.
            continue
        df[col] = df[col].fillna(modes.iloc[0])


def preprocess_data(df, target_col, missing_strategy="drop", transformation_map=None):
    """Return a cleaned, transformed copy of *df*; the input is never mutated.

    Args:
        df: Input DataFrame.
        target_col: Name of the target column. When it is present in the frame
            and holds string-like or categorical data it is label encoded.
            A falsy value or an unknown name skips this step.
        missing_strategy: How to handle missing values:
            ``"drop"`` removes every row containing a missing value;
            ``"mean"`` / ``"median"`` fill numeric columns with that statistic
            and non-numeric columns with their mode;
            ``"mode"`` fills every column with its mode.
            Any other value leaves missing values untouched.
        transformation_map: Optional mapping of column name to one of
            ``"Label Encode"`` or ``"Normalize"``. Any other value (for
            example ``"No Transformation"``) leaves the column as is.

    Returns:
        The processed DataFrame.

    Raises:
        KeyError: If ``transformation_map`` names a column that is not in the
            frame and requests ``"Label Encode"`` or ``"Normalize"``.
    """
    df = df.copy()

    # 1. Handle missing values
    if missing_strategy == "drop":
        df = df.dropna()
    elif missing_strategy in ("mean", "median"):
        numeric_cols = df.select_dtypes(include=["number"]).columns
        non_numeric_cols = df.columns.difference(numeric_cols)
        if missing_strategy == "mean":
            fill_values = df[numeric_cols].mean()
        else:
            fill_values = df[numeric_cols].median()
        df[numeric_cols] = df[numeric_cols].fillna(fill_values)
        # Mean/median are undefined for non-numeric data; fall back to the mode.
        _fill_missing_with_mode(df, non_numeric_cols)
    elif missing_strategy == "mode":
        _fill_missing_with_mode(df, df.columns)

    # 2. Apply feature transformations
    if transformation_map:
        for col, transform in transformation_map.items():
            if transform == "Label Encode":
                values = df[col]
                is_label_like = (
                    values.dtype == "object"
                    or str(values.dtype).startswith("category")
                )
                if not is_label_like:
                    # Stringify other dtypes so the encoder sees uniform labels.
                    values = values.astype(str)
                df[col] = LabelEncoder().fit_transform(values)
            elif transform == "Normalize":
                # Standardise the column to zero mean and unit variance.
                df[[col]] = StandardScaler().fit_transform(df[[col]])
            # "No Transformation" = leave column as is

    # 3. Label encode target column if it's a string
    if target_col and target_col in df.columns:
        target_dtype = df[target_col].dtype
        # Besides object/category, also accept pandas' dedicated string dtypes,
        # which do not compare equal to "object".
        if (
            target_dtype == "object"
            or str(target_dtype).startswith("category")
            or pd.api.types.is_string_dtype(target_dtype)
        ):
            df[target_col] = LabelEncoder().fit_transform(df[target_col])

    return df