Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
def _impute_numeric(column, fill_value):
    """Return *column* with its missing entries replaced by *fill_value*.

    The column is returned untouched when there is nothing to fill or when
    the fill value is itself missing (e.g. the column holds no data at all).
    """
    if pd.isna(fill_value) or not column.isna().any():
        return column
    # Nullable integer columns (Int64, UInt8, ...) reject a fractional fill
    # value with a TypeError, so widen them to nullable float first. Plain
    # NumPy integer columns never reach this point: they cannot hold NaN.
    needs_float = (
        pd.api.types.is_integer_dtype(column.dtype)
        and not float(fill_value).is_integer()
    )
    if needs_float:
        column = column.astype("Float64")
    return column.fillna(fill_value)


def cleanse_data(df, remove_duplicates, missing_strategy):
    """
    Perform data cleansing on the dataframe.

    The input frame is copied first, so the caller's object is not modified.
    Duplicate removal (if requested) runs before missing-value handling.

    Args:
        df: pandas DataFrame
        remove_duplicates: bool, whether to remove duplicate rows
        missing_strategy: str, one of
            'drop'          - drop every row that contains a missing value
            'impute_mean'   - fill numeric columns with their column mean
            'impute_median' - fill numeric columns with their column median
            'impute_mode'   - fill every column with the first value
                              returned by ``Series.mode()``
            Any other value leaves missing values untouched. With
            'impute_mean' / 'impute_median', non-numeric columns are not
            imputed.

    Returns:
        df_clean: cleaned DataFrame
        original_shape: tuple (rows, cols) before cleansing
        cleaned_shape: tuple (rows, cols) after cleansing
    """
    df = df.copy()
    original_shape = df.shape

    # Remove duplicates
    if remove_duplicates:
        df = df.drop_duplicates()

    # Handle missing values
    if missing_strategy == 'drop':
        df = df.dropna()
    elif missing_strategy in ('impute_mean', 'impute_median'):
        # Decide once which statistic to use instead of re-testing per column.
        use_mean = missing_strategy == 'impute_mean'
        for col in df.select_dtypes(include=[np.number]).columns:
            column = df[col]
            fill_value = column.mean() if use_mean else column.median()
            df[col] = _impute_numeric(column, fill_value)
    elif missing_strategy == 'impute_mode':
        for col in df.columns:
            modes = df[col].mode()
            # mode() is empty when the column has no non-missing values;
            # such a column is left as it is.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    cleaned_shape = df.shape
    return df, original_shape, cleaned_shape