AI-OMS-Analyze / scripts /data_cleansing.py
kawaiipeace's picture
Initialization
d4d1ca8
raw
history blame
1.39 kB
import pandas as pd
import numpy as np
def cleanse_data(df, remove_duplicates, missing_strategy):
    """
    Clean a DataFrame by optionally de-duplicating rows and handling NaNs.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: pandas DataFrame to clean.
        remove_duplicates: bool, drop fully duplicated rows when truthy.
        missing_strategy: str selecting how missing values are treated:
            'drop'          - discard every row that contains a missing value
            'impute_mean'   - fill numeric columns with their column mean
            'impute_median' - fill numeric columns with their column median
            'impute_mode'   - fill every column with its first modal value
            Any other value leaves missing values as they are.

    Returns:
        A 3-tuple ``(df_clean, original_shape, cleaned_shape)`` where the
        shapes are ``(rows, cols)`` tuples taken before and after cleansing.
    """
    shape_before = df.shape
    result = df.copy()

    # De-duplication runs first, so imputation statistics are computed on
    # the de-duplicated rows.
    if remove_duplicates:
        result = result.drop_duplicates()

    if missing_strategy == 'drop':
        result = result.dropna()
    elif missing_strategy == 'impute_mode':
        for name in result.columns:
            modes = result[name].mode()
            if modes.empty:
                # Column has no non-missing values, so there is nothing to
                # fill with.
                continue
            result[name] = result[name].fillna(modes[0])
    elif missing_strategy in ('impute_mean', 'impute_median'):
        use_mean = missing_strategy == 'impute_mean'
        # Mean/median are only applied to numeric columns; other columns
        # keep their missing values.
        numeric_names = result.select_dtypes(include=[np.number]).columns
        for name in numeric_names:
            series = result[name]
            fill_value = series.mean() if use_mean else series.median()
            result[name] = series.fillna(fill_value)

    return result, shape_before, result.shape