|
|
|
|
|
"""Data Cleaning Plugin""" |
|
|
import pandas as pd |
|
|
from typing import Dict, Any |
|
|
|
|
|
class DataCleaner:
    """Clean and standardize messy data for analytics."""

    # Placeholder strings treated as missing values. They are compared
    # case-sensitively against the stripped cell text.
    _NULL_TOKENS = ('', 'null', 'NULL', 'None', 'N/A', 'n/a', '#N/A', '-', '?', 'unknown')

    # Column-name substrings that mark a column for numeric coercion.
    _NUMERIC_KEYWORDS = ('amount', 'price', 'cost', 'value', 'count')

    def clean_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a cleaned copy of ``df``; the input frame is not modified.

        Steps, in order:

        1. Column names are cast to ``str``, stripped, lower-cased, spaces
           become underscores, and any character outside ``[a-z0-9_]`` is
           removed.
        2. Rows and columns that are entirely missing are dropped.
        3. In every object/string column, values are converted to stripped
           text and any value equal to one of the null placeholder tokens
           becomes ``pd.NA``. Cells that were already missing stay missing.
        4. Duplicate rows are dropped.

        Args:
            df: The frame to clean.

        Returns:
            A new, cleaned ``DataFrame``.
        """
        df = df.copy()

        df.columns = (
            df.columns.astype(str)
            .str.strip()
            .str.lower()
            .str.replace(' ', '_', regex=False)
            .str.replace(r'[^a-z0-9_]', '', regex=True)
        )

        df = df.dropna(how='all', axis=0).dropna(how='all', axis=1)

        null_tokens = list(self._NULL_TOKENS)
        for col in df.select_dtypes(include=['object', 'string']).columns:
            column = df[col]
            # Remember which cells are genuinely missing *before* casting to
            # str; otherwise NaN/pd.NA/NaT turn into the literal strings
            # 'nan'/'<NA>'/'NaT' and survive as fake data.
            was_missing = column.isna()
            text = column.astype(str).str.strip()
            df[col] = text.mask(was_missing | text.isin(null_tokens), pd.NA)

        return df.drop_duplicates()

    def enforce_schema(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with columns coerced according to their names.

        A column whose name contains ``'date'`` or ``'time'`` is passed through
        ``pd.to_datetime``; otherwise a column whose name contains one of the
        numeric keywords (``amount``, ``price``, ``cost``, ``value``,
        ``count``) is passed through ``pd.to_numeric``. Both use
        ``errors='coerce'``, so unparseable values become missing. Conversion
        is best-effort: a column that cannot be converted is left unchanged.

        NOTE(review): matching is by substring and is case-sensitive, so a
        name such as ``country`` matches ``count`` and ``updated_by`` matches
        ``date``; such columns will be coerced (typically to all-missing).
        Confirm whether callers rely on substring matching before tightening.

        Args:
            df: The frame to convert.

        Returns:
            A new ``DataFrame`` with the matching columns converted.
        """
        df = df.copy()

        for col in df.columns:
            try:
                # The name test stays inside the ``try`` on purpose: a
                # non-string label (e.g. an int) raises TypeError on ``in``
                # and is skipped.
                if 'date' in col or 'time' in col:
                    df[col] = pd.to_datetime(df[col], errors='coerce')
                elif any(kw in col for kw in self._NUMERIC_KEYWORDS):
                    df[col] = pd.to_numeric(df[col], errors='coerce')
            except (TypeError, ValueError, ArithmeticError):
                # Best-effort: leave this column as-is. Unlike a bare
                # ``except``, KeyboardInterrupt/SystemExit still propagate.
                continue

        return df
|
|
|