|
|
|
import pandas as pd
|
|
import numpy as np
|
|
import re
|
|
|
|
# Matches every character that is neither a word character nor whitespace.
_SPECIAL_CHARS = re.compile(r"[^\w\s]")


def _min_max_scale(series):
    """Rescale *series* linearly so its minimum maps to 0 and its maximum to 1."""
    low = series.min()
    span = series.max() - low
    if span == 0:
        # All non-missing values are identical; dividing would yield 0/0 = NaN
        # for the whole column, so map them to 0.0 instead.
        return (series - low).astype(float)
    return (series - low) / span


def _z_score(series):
    """Center *series* on its mean and divide by its standard deviation."""
    spread = series.std()
    centered = series - series.mean()
    if pd.isna(spread) or spread == 0:
        # Zero or undefined deviation: return the centered values rather than
        # dividing by it, which would turn the column into NaN/inf.
        return centered
    return centered / spread


def execute_plan(df, plan):
    """Apply a list of cleaning steps to a DataFrame and return the result.

    Each step is a dict with an ``"action"`` key and, for every action except
    ``"remove_duplicates"``, a ``"column"`` key. Optional keys are ``"method"``
    (a string selecting a variant of the action) and ``"params"`` (a dict of
    extra options). Other keys are ignored.

    Supported actions:

    * ``"drop"`` - remove the column (a missing column is ignored).
    * ``"impute"`` - fill missing values; ``method`` is ``"mean"``,
      ``"median"``, ``"mode"`` or ``"constant"`` (uses ``params["value"]``,
      default 0).
    * ``"standardize"`` - convert to lower-cased, stripped text; with
      ``params["remove_special_chars"]`` also delete characters that are
      neither word characters nor whitespace.
    * ``"normalize"`` - min-max rescale.
    * ``"scale"`` - ``method`` is ``"zscore"`` or ``"minmax"``.
    * ``"convert_dtype"`` - ``method`` is ``"int"``, ``"float"``, ``"str"`` or
      ``"datetime"``; a failed conversion is reported with ``print`` and the
      column is left unchanged.
    * ``"clip_outliers"`` - clip to ``params["lower"]`` / ``params["upper"]``,
      defaulting to the 1st / 99th percentile of the column.
    * ``"fill_outliers"`` - replace values outside 1.5 * IQR of the quartiles
      with the column median (default) or, for any other ``method``, the mean.
    * ``"map_values"`` - replace values according to ``params["mapping"]``.
    * ``"remove_duplicates"`` - drop duplicate rows.
    * ``"strip_whitespace"`` - convert to text and strip surrounding whitespace.

    Steps with an unrecognized action or method leave the data unchanged.
    Steps that name a column not present in the frame are skipped with a
    printed notice.

    Args:
        df: The DataFrame to clean. It is not modified.
        plan: Iterable of step dicts, applied in order.

    Returns:
        A new DataFrame with all steps applied.

    Raises:
        KeyError: If a step has no ``"action"`` key.
    """
    # Work on a copy so the caller's frame is never altered. Without this the
    # input was mutated in place until the first step that rebinds ``df``.
    df = df.copy()

    for step in plan:
        action = step["action"]
        col = step.get("column")
        method = step.get("method", "")
        params = step.get("params") or {}

        # Frame-level actions first: they do not need an existing column.
        if action == "remove_duplicates":
            df = df.drop_duplicates()
            continue
        if action == "drop":
            df = df.drop(columns=[col], errors="ignore")
            continue

        try:
            series = df[col]
        except KeyError:
            # E.g. the column was removed by an earlier "drop" step.
            print(f"Skipping {action} on missing column {col!r}")
            continue

        if action == "impute":
            if method == "mean":
                df[col] = series.fillna(series.mean())
            elif method == "median":
                df[col] = series.fillna(series.median())
            elif method == "mode":
                modes = series.mode()
                # mode() is empty when every value is missing.
                if not modes.empty:
                    df[col] = series.fillna(modes.iloc[0])
            elif method == "constant":
                df[col] = series.fillna(params.get("value", 0))

        elif action == "standardize":
            text = series.astype(str).str.lower().str.strip()
            # Keep missing values missing instead of the text "nan"/"None".
            text = text.where(series.notna())
            if params.get("remove_special_chars", False):
                text = text.map(
                    lambda value: _SPECIAL_CHARS.sub("", value),
                    na_action="ignore",
                )
            df[col] = text

        elif action == "normalize":
            df[col] = _min_max_scale(series)

        elif action == "scale":
            if method == "zscore":
                df[col] = _z_score(series)
            elif method == "minmax":
                df[col] = _min_max_scale(series)

        elif action == "convert_dtype":
            try:
                if method == "int":
                    df[col] = series.astype(int)
                elif method == "float":
                    df[col] = series.astype(float)
                elif method == "str":
                    df[col] = series.astype(str)
                elif method == "datetime":
                    df[col] = pd.to_datetime(series, errors='coerce')
            except Exception as e:
                # Deliberately best-effort: report and carry on with the plan.
                print(f"Could not convert {col} to {method}: {e}")

        elif action == "clip_outliers":
            # Only compute the percentile defaults when a bound is not given.
            if "lower" in params:
                lower = params["lower"]
            else:
                lower = series.quantile(0.01)
            if "upper" in params:
                upper = params["upper"]
            else:
                upper = series.quantile(0.99)
            df[col] = series.clip(lower=lower, upper=upper)

        elif action == "fill_outliers":
            q1 = series.quantile(0.25)
            q3 = series.quantile(0.75)
            iqr = q3 - q1
            outliers = (series < q1 - 1.5 * iqr) | (series > q3 + 1.5 * iqr)
            if (method or "median") == "median":
                replacement = series.median()
            else:
                replacement = series.mean()
            # mask() upcasts when needed (e.g. a fractional median going into
            # an integer column), unlike an in-place .loc assignment.
            df[col] = series.mask(outliers, replacement)

        elif action == "map_values":
            df[col] = series.replace(params.get("mapping", {}))

        elif action == "strip_whitespace":
            # Keep missing values missing instead of the text "nan"/"None".
            df[col] = series.astype(str).str.strip().where(series.notna())

    return df
|
|
|