| import streamlit as st |
| import pandas as pd |
| import plotly.express as px |
| import plotly.graph_objects as go |
| from io import BytesIO |
| import json |
| from datetime import datetime |
|
|
| |
| from data_processor import DataProcessor |
| from cleaning_engine import CleaningEngine |
| from config import Config |
| from utils.logger import setup_logger |
| from advanced_validation_ui import render_advanced_validation_tab |
|
|
| |
# --- Page configuration (must run before any other Streamlit call) ---
st.set_page_config(
    page_title="AI-Based Data Cleaner",
    page_icon="π§Ή",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Module-level logger for this app.
logger = setup_logger(__name__)

# Session-state slots shared across tabs; initialized once per session.
_SESSION_DEFAULTS = {
    'original_df': None,    # DataFrame as uploaded, before cleaning
    'cleaned_df': None,     # DataFrame produced by the cleaning engine
    'cleaning_report': None, # dict report returned alongside the cleaned data
    'file_uploaded': False,  # whether a file has been successfully loaded
}
for _key, _default in _SESSION_DEFAULTS.items():
    if _key not in st.session_state:
        st.session_state[_key] = _default
|
|
def main():
    """Render the application: sidebar configuration plus the five main tabs."""
    st.title("π§Ή AI-Based Data Cleaner")
    st.markdown("Upload your Excel or CSV file and let AI clean your data automatically!")

    with st.sidebar:
        st.header("βοΈ Configuration")

        # The AI features are unusable without an API key, so fail fast here.
        if not Config.OPENAI_API_KEY:
            st.error("β οΈ OpenAI API key not found! Please set OPENAI_API_KEY in your .env file.")
            st.stop()
        else:
            # NOTE: this success message was split across two physical lines in
            # the original (a syntax error as presented); rejoined here.
            st.success("β OpenAI API key configured")

        st.header("π§ Cleaning Options")
        # Boolean flags consumed by the cleaning engine via upload_and_clean_tab.
        cleaning_options = {
            'remove_duplicates': st.checkbox("Remove duplicate rows", value=True),
            'ai_text_cleaning': st.checkbox("AI-powered text cleaning", value=True),
            'fill_missing_values': st.checkbox("Fill missing values", value=True),
            'use_ai_for_missing': st.checkbox("Use AI for missing values (small datasets)", value=False),
            'fix_data_types': st.checkbox("Fix data types", value=True),
            'handle_outliers': st.checkbox("Handle outliers", value=False)
        }

        if st.button("βΉοΈ About"):
            show_about()

    tab1, tab2, tab3, tab4, tab5 = st.tabs(["π Upload & Clean", "π Data Analysis", "π Before/After", "π¬ Advanced Validation", "π₯ Download"])

    with tab1:
        upload_and_clean_tab(cleaning_options)

    with tab2:
        data_analysis_tab()

    with tab3:
        before_after_tab()

    with tab4:
        advanced_validation_tab()

    with tab5:
        download_tab()
|
|
def upload_and_clean_tab(cleaning_options):
    """File upload and cleaning tab.

    Args:
        cleaning_options: Dict of boolean cleaning flags built in the sidebar.
    """
    st.header("π Upload Your Data File")

    uploaded_file = st.file_uploader(
        "Choose a CSV or Excel file",
        type=['csv', 'xlsx', 'xls'],
        help=f"Maximum file size: {Config.MAX_FILE_SIZE_MB}MB"
    )

    if uploaded_file is None:
        return

    try:
        # Enforce the configured size limit before attempting to parse.
        file_size_mb = len(uploaded_file.getvalue()) / (1024 * 1024)
        if file_size_mb > Config.MAX_FILE_SIZE_MB:
            st.error(f"File size ({file_size_mb:.1f}MB) exceeds maximum allowed size ({Config.MAX_FILE_SIZE_MB}MB)")
            return

        with st.spinner("Loading data..."):
            data_processor = DataProcessor()
            df = data_processor.read_file(uploaded_file.getvalue(), uploaded_file.name)
            # Persist so the other tabs can access the uploaded data.
            st.session_state.original_df = df
            st.session_state.file_uploaded = True

        # NOTE: this success message was split across two physical lines in the
        # original (a syntax error as presented); rejoined here.
        st.success(f"β File loaded successfully! Shape: {df.shape}")

        st.subheader("π Data Preview")
        st.dataframe(df.head(10), use_container_width=True)

        st.subheader("π Data Quality Overview")
        show_data_quality_overview(df)

        if st.button("π§Ή Clean Data", type="primary", use_container_width=True):
            clean_data(df, cleaning_options)

    except Exception as e:
        st.error(f"Error loading file: {str(e)}")
        logger.error(f"Error loading file {uploaded_file.name}: {str(e)}")
|
|
def clean_data(df, cleaning_options):
    """Run the cleaning engine on *df* and store the results in session state.

    Args:
        df: Original pandas DataFrame to clean.
        cleaning_options: Dict of boolean flags selected in the sidebar.
    """
    try:
        with st.spinner("π€ AI is cleaning your data... This may take a few minutes."):
            cleaning_engine = CleaningEngine()
            cleaned_df, cleaning_report = cleaning_engine.clean_dataset(df, cleaning_options)

            # Persist results so the comparison and download tabs can use them.
            st.session_state.cleaned_df = cleaned_df
            st.session_state.cleaning_report = cleaning_report

            # NOTE: this success message was split across two physical lines in
            # the original (a syntax error as presented); rejoined here.
            st.success("β Data cleaning completed!")

            if cleaning_report['statistics'].get('summary'):
                summary = cleaning_report['statistics']['summary']

                col1, col2, col3, col4 = st.columns(4)
                with col1:
                    st.metric("Total Changes", summary['total_changes'])
                with col2:
                    # Delta is (after - before): negative when missing values were
                    # reduced. The previous code formatted f"-{before - after}",
                    # which rendered "--N" whenever missing values increased.
                    missing_delta = summary['missing_values_after'] - summary['missing_values_before']
                    st.metric("Missing Values",
                              f"{summary['missing_values_after']}",
                              f"{missing_delta}")
                with col3:
                    st.metric("Rows", summary['rows_after'],
                              f"{summary['rows_after'] - summary['rows_before']}")
                with col4:
                    st.metric("Missing Reduction", f"{summary['missing_reduction_percentage']:.1f}%")

            # Surface non-fatal problems the engine recorded during cleaning.
            if cleaning_report.get('errors'):
                st.warning("β οΈ Some issues occurred during cleaning:")
                for error in cleaning_report['errors']:
                    st.write(f"β’ {error}")

    except Exception as e:
        st.error(f"Error during data cleaning: {str(e)}")
        logger.error(f"Error during data cleaning: {str(e)}")
|
|
def show_data_quality_overview(df):
    """Show data quality overview"""
    # Headline metrics rendered as a four-column strip.
    missing_count = df.isnull().sum().sum()
    duplicate_count = df.duplicated().sum()

    metrics = [
        ("Rows", f"{df.shape[0]:,}"),
        ("Columns", df.shape[1]),
        ("Missing Values", f"{missing_count:,}"),
        ("Duplicates", f"{duplicate_count:,}"),
    ]
    for col, (label, value) in zip(st.columns(4), metrics):
        with col:
            st.metric(label, value)

    # Per-column missing-value breakdown, only when something is missing.
    if missing_count > 0:
        st.subheader("Missing Values by Column")
        per_column = df.isnull().sum()
        per_column = per_column[per_column > 0].sort_values(ascending=False)

        if not per_column.empty:
            chart = px.bar(
                x=per_column.values,
                y=per_column.index,
                orientation='h',
                title="Missing Values Count by Column"
            )
            chart.update_layout(height=400)
            st.plotly_chart(chart, use_container_width=True)
|
|
def data_analysis_tab():
    """Data analysis and statistics tab"""
    # Nothing to analyze until a file has been uploaded.
    if st.session_state.original_df is None:
        st.info("Please upload a file first.")
        return

    st.header("π Data Analysis")

    df = st.session_state.original_df
    all_columns = df.columns.tolist()

    # Default to the first five columns to keep the initial view compact.
    selected_columns = st.multiselect(
        "Select columns to analyze:",
        all_columns,
        default=all_columns[:5]
    )

    if not selected_columns:
        return

    st.subheader("π Basic Statistics")
    st.dataframe(df[selected_columns].describe(), use_container_width=True)

    st.subheader("π·οΈ Data Types")
    # One row per selected column: dtype, non-null count, cardinality.
    type_rows = [
        {
            'Column': col,
            'Data Type': str(df[col].dtype),
            'Non-Null Count': df[col].count(),
            'Unique Values': df[col].nunique(),
        }
        for col in selected_columns
    ]
    st.dataframe(pd.DataFrame(type_rows), use_container_width=True)
|
|
def before_after_tab():
    """Side-by-side before/after comparison of one column of the data."""
    if st.session_state.original_df is None or st.session_state.cleaned_df is None:
        st.info("Please upload and clean a file first.")
        return

    st.header("π Before/After Comparison")

    original_df = st.session_state.original_df
    cleaned_df = st.session_state.cleaned_df

    column = st.selectbox("Select column to compare:", original_df.columns)

    # Cleaning may drop or rename columns; guard before indexing the cleaned
    # frame, which previously raised a KeyError in that case.
    if column not in cleaned_df.columns:
        st.warning(f"Column '{column}' is not present in the cleaned data.")
        return

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("π Before Cleaning")
        st.dataframe(original_df[column].head(20), use_container_width=True)

        # is_numeric_dtype also covers int32/float32 etc., which the previous
        # ['int64', 'float64'] membership test missed.
        if pd.api.types.is_numeric_dtype(original_df[column]):
            st.write("**Statistics:**")
            st.write(f"Mean: {original_df[column].mean():.2f}")
            st.write(f"Missing: {original_df[column].isnull().sum()}")

    with col2:
        st.subheader("β¨ After Cleaning")
        st.dataframe(cleaned_df[column].head(20), use_container_width=True)

        if pd.api.types.is_numeric_dtype(cleaned_df[column]):
            st.write("**Statistics:**")
            st.write(f"Mean: {cleaned_df[column].mean():.2f}")
            st.write(f"Missing: {cleaned_df[column].isnull().sum()}")

    # List the report entries that touched the selected column, if any.
    if st.session_state.cleaning_report:
        st.subheader("π Changes Made")
        changes = st.session_state.cleaning_report.get('changes', [])
        column_changes = [change for change in changes if change.get('column') == column]

        if column_changes:
            for change in column_changes:
                st.write(f"β’ **{change['type']}**: {change}")
        else:
            st.info("No changes made to this column.")
|
|
def download_tab():
    """Download tab: export the cleaned data as CSV or Excel plus the report."""
    if st.session_state.cleaned_df is None:
        st.info("Please clean your data first.")
        return

    st.header("π₯ Download Cleaned Data")

    cleaned_df = st.session_state.cleaned_df

    file_format = st.radio("Select download format:", ["CSV", "Excel"])

    if file_format == "CSV":
        # to_csv() with no buffer returns str; encode explicitly rather than
        # writing into a BytesIO, which relies on pandas >= 1.2 supporting
        # binary file handles in to_csv.
        csv_data = cleaned_df.to_csv(index=False).encode('utf-8')

        st.download_button(
            label="π₯ Download CSV",
            data=csv_data,
            file_name=f"cleaned_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv",
            mime="text/csv",
            use_container_width=True
        )

    else:
        excel_buffer = BytesIO()
        with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
            cleaned_df.to_excel(writer, sheet_name='Cleaned_Data', index=False)

            # Add the cleaning report as a second sheet when one is available.
            # .get avoids a KeyError if the report lacks a 'changes' entry.
            if st.session_state.cleaning_report:
                report_df = pd.DataFrame(st.session_state.cleaning_report.get('changes', []))
                if not report_df.empty:
                    report_df.to_excel(writer, sheet_name='Cleaning_Report', index=False)

        excel_data = excel_buffer.getvalue()

        st.download_button(
            label="π₯ Download Excel",
            data=excel_data,
            file_name=f"cleaned_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            use_container_width=True
        )

    if st.session_state.cleaning_report:
        st.subheader("π Cleaning Report")

        with st.expander("View detailed cleaning report"):
            st.json(st.session_state.cleaning_report)
|
|
def advanced_validation_tab():
    """Advanced data quality validation tab.

    Delegates to the advanced-validation UI module when data is loaded;
    otherwise shows an explanatory placeholder.
    """
    if st.session_state.original_df is not None:
        # Heavy lifting lives in advanced_validation_ui.render_advanced_validation_tab.
        render_advanced_validation_tab(st.session_state.original_df)
    else:
        st.info("π Please upload a data file first to perform advanced validation.")
        st.markdown("""
        ### π¬ Advanced Data Quality Validation

        This feature provides comprehensive analysis of your data for:

        - **π Duplicate Identity Detection**: Find duplicate emails, phone numbers, and contact information
        - **π Data Pattern Anomalies**: Detect suspicious clustering and artificial standardization
        - **βοΈ Business Logic Violations**: Identify chronological inconsistencies and employment logic errors
        - **π― Contextual Integrity Issues**: Spot bulk import patterns and unrealistic data ranges

        Upload your data file to get started!
        """)
|
|
def show_about():
    """Show about information.

    Renders a static description of the app's features, supported formats,
    and requirements inside an st.info box (triggered by the sidebar button).
    """
    st.info("""
    **AI-Based Data Cleaner** π§Ή

    This application uses artificial intelligence to automatically clean and validate your data:

    **Features:**
    β’ π€ AI-powered text cleaning and spelling correction
    β’ π Intelligent missing value imputation
    β’ π Duplicate detection and removal
    β’ π Data type optimization
    β’ π Outlier detection and handling

    **Supported Formats:**
    β’ CSV files (.csv)
    β’ Excel files (.xlsx, .xls)

    **Requirements:**
    β’ OpenAI API key for AI features
    β’ Maximum file size: 50MB
    """)
|
|
| if __name__ == "__main__": |
| main() |