# data_clean_simple.py
import pandas as pd
import streamlit as st
import re
from functools import lru_cache

# Dictionary of common strategies for data cleaning
cleaning_strategies = {
    "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.",
    "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.",
    "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.",
    "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'."
}

@lru_cache(maxsize=32)
def get_column_type(column_name):
    column_lower = column_name.lower()
    if any(keyword in column_lower for keyword in ['date', 'year', 'time', 'timestamp']):
        return 'date'
    elif any(keyword in column_lower for keyword in ['price', 'amount', 'cost', 'quantity', 'value']):
        return 'numeric'
    elif any(keyword in column_lower for keyword in ['name', 'description', 'text', 'comment']):
        return 'text'
    elif any(keyword in column_lower for keyword in ['category', 'type', 'label', 'class']):
        return 'categorical'
    else:
        if re.search(r'(num|amt|count|age|height|weight|total|\d+)', column_lower):
            return 'numeric'
        return 'text'
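
# Illustrative examples (assumed column names, not tied to any specific dataset):
#   get_column_type("order_date")    -> 'date'
#   get_column_type("unit_price")    -> 'numeric'
#   get_column_type("customer_name") -> 'text'
#   get_column_type("product_type")  -> 'categorical'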

def suggest_fill_strategies(column_name, examples):
    column_type = get_column_type(column_name)
    # Create a basic analysis of the data
    valid_examples = [ex for ex in examples if pd.notna(ex) and ex != '']
    # Build a simple suggestion based on column type and examples
    suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text'])
    if valid_examples:
        suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}."
        # For numeric data, add statistics (coerce non-numeric examples to NaN and drop them)
        if column_type == 'numeric':
            numeric_values = pd.to_numeric(pd.Series(valid_examples), errors='coerce').dropna()
            if not numeric_values.empty:
                suggestion += f"\n\nConsider replacing missing values with the mean ({numeric_values.mean():.2f}) or median ({numeric_values.median():.2f})."
    return suggestion

def clean_data(file_path, progress_callback=None):
    # Support CSV and TSV files
    # Load data and drop duplicates
    # Clean data with progress updates
    if file_path.endswith('.tsv'):
        df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy()
    else:
        df = pd.read_csv(file_path).drop_duplicates().copy()
    suggestions_log = []
    total_columns = len(df.columns)
    # Convert column types
    for i, col in enumerate(df.columns):
        # Update progress if callback provided
        if progress_callback:
            progress = i / total_columns
            progress_callback(progress)
        if df[col].dtype == 'object':
            df[col] = df[col].str.strip().str.lower()  # Normalize text
            # Replace newline and carriage-return characters with spaces
            df[col] = df[col].str.replace('\n', ' ', regex=False).str.replace('\r', ' ', regex=False)
            if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']):
                df[col] = df[col].str.replace(r'[^\d]', '', regex=True)
                # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001)
                df[col] = df[col].str.replace(
                    r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True
                )
            # Remove currency symbols and commas
            if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any():
                df[col] = df[col].str.replace(r'[$,]', '', regex=True)
            # Try to convert to numeric where possible
            if col.lower().find('id') == -1:  # Skip ID columns, which should remain as strings
                try:
                    df[col] = pd.to_numeric(df[col])
                except (ValueError, TypeError):
                    pass  # Leave columns that are not fully numeric unchanged
        # Check for missing or weird values
        null_count = df[col].isnull().sum()
        empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0
        pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+|missing|unknown',
                                                           na=False, case=False, regex=True).sum()
        # Generate suggestions for missing or weird values
        if null_count > 0 or empty_str_count > 0 or pattern_matches > 0:
            # Get non-null, non-empty examples for analysis - handle empty dataframes
            try:
                good_df = df[col][
                    df[col].notnull() &
                    (df[col].astype(str) != "") &
                    ~df[col].astype(str).str.contains(r'none|null|n/a|na|\?+|missing|unknown',
                                                      na=False, case=False, regex=True)
                ]
                if len(good_df) > 0:
                    unique_values = good_df.drop_duplicates()
                    sample_size = min(5, len(unique_values))
                    good_examples = unique_values.sample(n=sample_size, random_state=1).tolist()
                else:
                    good_examples = []
            except Exception:
                good_examples = []
            # Generate suggestions
            suggestion = suggest_fill_strategies(col, good_examples)
            suggestions_log.append((col, suggestion))
    # Final progress update
    if progress_callback:
        progress_callback(1.0)
    # Reset index for consistency
    df = df.reset_index(drop=True)
    return df, suggestions_log

def display_suggestions_report(suggestions_log):
    if suggestions_log:
        st.subheader("🤖 Data Cleaning Suggestions")
        for col, suggestion in suggestions_log:
            st.markdown(f"**Column:** `{col}`")
            if suggestion:
                st.code(suggestion, language="python")
            else:
                st.write("No suggestions or response error.")
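
# --- Usage sketch (illustrative, not part of the original app) ---
# A minimal way to wire the helpers above into a Streamlit page.
# "example.csv" is an assumed local file path; swap in a real CSV or TSV file.
if __name__ == "__main__":
    st.title("Data Cleaner")
    progress_bar = st.progress(0.0)
    cleaned_df, suggestions = clean_data("example.csv", progress_callback=progress_bar.progress)
    st.dataframe(cleaned_df)
    display_suggestions_report(suggestions)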