Spaces:
Sleeping
Sleeping
| # data_clean_simple.py | |
| import pandas as pd | |
| import streamlit as st | |
| import re | |
| from functools import lru_cache | |
| # Dictionary of common strategies for data cleaning | |
| cleaning_strategies = { | |
| "date": "For date columns, consider forward-fill or backward-fill from existing dates, or converting to datetime format.", | |
| "numeric": "For numeric columns, consider filling missing values with the mean or median of the column.", | |
| "text": "For text columns, consider filling missing values with a placeholder like 'Unknown' or the most frequent value.", | |
| "categorical": "For categorical columns, consider filling missing values with the mode or a placeholder like 'Unknown'." | |
| } | |
| def get_column_type(column_name): | |
| column_lower = column_name.lower() | |
| if any(keyword in column_lower for keyword in ['date', 'year', 'time', 'timestamp']): | |
| return 'date' | |
| elif any(keyword in column_lower for keyword in ['price', 'amount', 'cost', 'quantity', 'value']): | |
| return 'numeric' | |
| elif any(keyword in column_lower for keyword in ['name', 'description', 'text', 'comment']): | |
| return 'text' | |
| elif any(keyword in column_lower for keyword in ['category', 'type', 'label', 'class']): | |
| return 'categorical' | |
| else: | |
| if re.search(r'(num|amt|count|age|height|weight|total|\d+)', column_lower): | |
| return 'numeric' | |
| return 'text' | |
| def suggest_fill_strategies(column_name, examples): | |
| column_type = get_column_type(column_name) | |
| # Create a basic analysis of the data | |
| valid_examples = [ex for ex in examples if pd.notna(ex) and ex != ''] | |
| # Build a simple suggestion based on column type and examples | |
| suggestion = cleaning_strategies.get(column_type, cleaning_strategies['text']) | |
| if valid_examples: | |
| suggestion += f"\n\nExample values: {', '.join(map(str, valid_examples[:3]))}." | |
| # For numeric data, add statistics | |
| if column_type == 'numeric': | |
| mean_value = pd.Series(valid_examples).astype(float).mean() | |
| median_value = pd.Series(valid_examples).astype(float).median() | |
| suggestion += f"Consider replacing values with\n\nMean: {mean_value:.2f}, Median: {median_value:.2f}." | |
| return suggestion | |
| def clean_data(file_path, progress_callback=None): | |
| # Support CSV and TSV files | |
| # Load data and drop duplicates | |
| # Clean data with progress updates | |
| if file_path.endswith('.tsv'): | |
| df = pd.read_csv(file_path, sep='\t').drop_duplicates().copy() | |
| else: | |
| df = pd.read_csv(file_path).drop_duplicates().copy() | |
| suggestions_log = [] | |
| total_columns = len(df.columns) | |
| # Convert column types | |
| for i, col in enumerate(df.columns): | |
| # Update progress if callback provided | |
| if progress_callback: | |
| progress = i / total_columns | |
| progress_callback(progress) | |
| if df[col].dtype == 'object': | |
| df[col] = df[col].str.strip().str.lower() # Normalize text | |
| # Escape newline characters | |
| df[col] = df[col].str.replace('\n', ' ', regex=False).replace('\r', ' ', regex=False) | |
| if any(keyword in col.lower() for keyword in ['date', 'year', 'time', 'timestamp']): | |
| df[col] = df[col].str.replace(r'[^\d]', '', regex=True) | |
| # Normalize 4-digit year ranges (e.g., 2000-2001, 2000--2001, 20002001) | |
| df[col] = df[col].replace( | |
| r'(?<!\d)(\d{4})\s*[-–—./]?\s*(\d{4})(?!\d)', r'\1-\2', regex=True | |
| ) | |
| # Remove currency symbols and commas | |
| if df[col].astype(str).str.contains(r'[$,]', na=False, regex=True).any(): | |
| df[col] = df[col].str.replace(r'[$,]', '', regex=True) | |
| # Always try to convert to numeric if possible | |
| if col.lower().find('id') == -1: # Skip ID columns which should remain as strings | |
| try: | |
| df[col] = pd.to_numeric(df[col], errors='ignore') | |
| except: | |
| return None | |
| # Check for missing or weird values | |
| null_count = df[col].isnull().sum() | |
| empty_str_count = (df[col] == '').sum() if df[col].dtype == 'object' else 0 | |
| pattern_matches = df[col].astype(str).str.contains(r'none|null|n/a|na|\?+missing|unknown', | |
| na=False, case=False, regex=True).sum() | |
| # Generate suggestions for missing or weird values | |
| if null_count > 0 or empty_str_count > 0 or pattern_matches > 0: | |
| # Get non-null, non-empty examples for analysis - handle empty dataframes | |
| try: | |
| good_df = df[col][ | |
| df[col].notnull() & | |
| (df[col].astype(str) != "") & | |
| ~df[col].astype(str).contains(r'none|null|n/a|na|\?+|missing|unknown', | |
| na=False, case=False, regex=True) | |
| ] | |
| if len(good_df) > 0: | |
| sample_size = min(5, len(good_df)) | |
| good_examples = good_df.drop_duplicates().sample(n=sample_size, random_state=1).tolist() | |
| else: | |
| good_examples = [] | |
| except: | |
| good_examples = [] | |
| # Generate suggestions | |
| suggestion = suggest_fill_strategies(col, good_examples) | |
| suggestions_log.append((col, suggestion)) | |
| # Final progress update | |
| if progress_callback: | |
| progress_callback(1.0) | |
| # Reset index for consistency | |
| df = df.reset_index(drop=True) | |
| return df, suggestions_log | |
| def display_suggestions_report(suggestions_log): | |
| if suggestions_log: | |
| st.subheader("🤖 Data Cleaning Suggestions") | |
| for col, suggestion in suggestions_log: | |
| st.markdown(f"**Column:** `{col}`") | |
| if suggestion: | |
| st.code(suggestion, language="python") | |
| else: | |
| st.write("No suggestions or response error.") | |