# AutoML/src/preprocessing/clean_df_fallback.py
import re
import pandas as pd
import numpy as np
import streamlit as st
# Define fallback cleaning function
@st.cache_data
def clean_dataframe_fallback(df):
"""Hardcoded data cleaning pipeline"""
"""Generic data cleaning pipeline with categorical preservation"""
df_cleaned = df.copy()
    # Remove parenthetical content
    df_cleaned = df_cleaned.applymap(
        lambda x: re.sub(r"\(.*?\)", "", x) if isinstance(x, str) else x)
    # Remove 'ref.' references
    df_cleaned = df_cleaned.applymap(
        lambda x: re.sub(r"ref\.", "", x, flags=re.IGNORECASE) if isinstance(x, str) else x)
    # Remove special characters except word characters, spaces, and dots
    df_cleaned = df_cleaned.applymap(
        lambda x: re.sub(r"[^\w\s.]", "", x).strip() if isinstance(x, str) else x)
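    # Example (hypothetical cell): "1.8 GHz (turbo)" -> "1.8 GHz"; leftover inner
    # whitespace in longer strings is collapsed later, in Step 5.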
    # Step 0 - Normalize column names (cell values were already cleaned above)
    df_cleaned.columns = [col.strip().lower().replace(' ', '_') for col in df_cleaned.columns]
    # Measurement-unit suffixes to strip from matching columns in Step 2
    measurement_units = {
        'weight': r'\s*(kg|kilograms|lbs|pounds)$',
        'height': r'\s*(cm|centimeters|inches|feet|ft)$'
    }
    # Step 1 - Remove redundant columns
    # Preservation patterns for categorical columns
    preserve_pattern = re.compile(r'(name|brand|model|type|category|region|text|desc|color|size)', re.IGNORECASE)
    preserved_cols = [col for col in df_cleaned.columns if preserve_pattern.search(col)]
    # ID-like column detection (short tokens like 'no' and 'num' can over-match)
    id_pattern = re.compile(r'(_id|id_|num|no|number|identifier|code|idx|row)', re.IGNORECASE)
    id_cols = [col for col in df_cleaned.columns if id_pattern.search(col) and col not in preserved_cols]
    # Columns whose values are all unique behave like row identifiers
    unique_cols = [col for col in df_cleaned.columns
                   if df_cleaned[col].nunique() == len(df_cleaned)
                   and col not in preserved_cols]
    redundant_cols = sorted(set(id_cols + unique_cols))
    df_cleaned = df_cleaned.drop(columns=redundant_cols)
    print(f"Removed {len(redundant_cols)} redundant columns: {redundant_cols}")
    # Step 2 - Enhanced numeric detection with categorical protection
    for col in df_cleaned.columns:
        if col in preserved_cols:
            print(f"Preserving categorical column: {col}")
            continue  # Skip preserved columns
        # Strip measurement-unit suffixes, looking up the unit key that actually
        # matched and leaving non-string values such as NaN untouched
        matched_unit = next((unit for unit in measurement_units if unit in col), None)
        if matched_unit:
            pattern = measurement_units[matched_unit]
            df_cleaned[col] = df_cleaned[col].map(
                lambda x: re.sub(pattern, '', x).strip() if isinstance(x, str) else x)
        if pd.api.types.is_numeric_dtype(df_cleaned[col]):
            continue
        # Strict numeric pattern detection on a sample of non-null values
        non_null = df_cleaned[col].dropna()
        if non_null.empty:
            continue
        sample = non_null.sample(min(100, len(non_null)), random_state=42)
        numeric_pattern = r'^[-+]?\d*\.?\d+$'  # Full-string match
        num_matches = sample.astype(str).str.fullmatch(numeric_pattern).mean()
        if num_matches > 0.8:  # High threshold
            # Conservative cleaning: strip everything except digits, dots, and minus signs
            cleaned = df_cleaned[col].replace(r'[^\d.\-]', '', regex=True)
            converted = pd.to_numeric(cleaned, errors='coerce')
            # Measure success only on originally non-null entries, so pre-existing
            # missing values do not count against the conversion
            success_rate = converted[df_cleaned[col].notna()].notna().mean()
            if success_rate > 0.9:  # Strict success requirement
                df_cleaned[col] = converted
                print(f"Converted {col} to numeric (success: {success_rate:.1%})")
    # Step 3 - Date detection
    date_cols = []
    for col in df_cleaned.select_dtypes(exclude=np.number).columns:
        if col in preserved_cols:
            continue
        try:
            df_cleaned[col] = pd.to_datetime(df_cleaned[col], errors='raise')
            date_cols.append(col)
            print(f"Detected datetime: {col}")
        except (ValueError, TypeError):
            pass
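    # Example: a hypothetical "order_date" column of strings like "2021-03-15" parses
    # cleanly and is converted; a column with any unparseable value stays as text.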
    # Manual pass for currency-like columns
    currency_cols = [col for col in df_cleaned.columns
                     if any(keyword in col.lower() for keyword in ("price", "gross", "budget"))]
    for col in currency_cols:
        # Remove everything except digits and dots, then coerce to numeric
        df_cleaned[col] = df_cleaned[col].astype(str).str.replace(r'[^\d.]', '', regex=True)
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce')
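    # Example: "$1,234.56" -> "1234.56" -> 1234.56; values with no digits left coerce to NaN.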
    # Step 4 - Missing value handling (datetimes are excluded so NaT and dtype survive)
    numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
    categorical_cols = df_cleaned.select_dtypes(exclude=[np.number, 'datetime', 'datetimetz']).columns
    # Numeric imputation: flag missingness, then fill with the column median
    for col in numeric_cols:
        if df_cleaned[col].isna().any():
            df_cleaned[f'{col}_missing'] = df_cleaned[col].isna().astype(int)
            df_cleaned[col] = df_cleaned[col].fillna(df_cleaned[col].median())
    # Categorical imputation: fill with the mode, falling back to 'Unknown'
    for col in categorical_cols:
        if df_cleaned[col].isna().any():
            modes = df_cleaned[col].mode()
            mode_val = modes.iloc[0] if not modes.empty else 'Unknown'
            df_cleaned[col] = df_cleaned[col].fillna(mode_val)
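    # Example: a numeric "age" column with gaps gains an "age_missing" 0/1 flag and
    # has its gaps filled with the median; categorical gaps take the modal value.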
    # Step 5 - Text normalization for non-preserved, non-datetime columns
    text_cols = [col for col in categorical_cols if col not in preserved_cols]
    for col in text_cols:
        df_cleaned[col] = df_cleaned[col].astype(str).apply(
            lambda x: re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', '', x)).strip().lower())
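    # Example: "  New   York!! " -> "new york" (punctuation stripped, whitespace collapsed).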
    # Step 6 - Outlier handling (preserve categoricals)
    numeric_cols = df_cleaned.select_dtypes(include=np.number).columns
    for col in numeric_cols:
        if df_cleaned[col].nunique() > 10:
            # Winsorize at the 5th/95th percentiles (not the true quartiles)
            lower = df_cleaned[col].quantile(0.05)
            upper = df_cleaned[col].quantile(0.95)
            df_cleaned[col] = np.clip(df_cleaned[col], lower, upper)
    # Step 7 - Final validation: drop duplicate rows and reset the index
    df_cleaned = df_cleaned.drop_duplicates().reset_index(drop=True)
    return df_cleaned
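
# Minimal usage sketch. The DataFrame below is hypothetical and only illustrates the
# expected behavior of each step; running this file directly (outside `streamlit run`)
# works, though st.cache_data may log a warning about the missing Streamlit runtime.
if __name__ == "__main__":
    demo = pd.DataFrame({
        'Product Name': ['Ultrabook (2021)', 'Phone ref. A', 'Tablet Mini'],
        'Weight': ['1.5 kg', '0.3 kg', None],    # unit suffix stripped, then numeric
        'Price': ['$999.99', '1,499', '299.5'],  # currency symbols removed, coerced
        'Row No': [1, 2, 3],                     # ID-like and all-unique -> dropped
    })
    cleaned = clean_dataframe_fallback(demo)
    print(cleaned.dtypes)
    print(cleaned)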