# app.py - Deploy to Hugging Face Space (New → Gradio → paste this file).
# Deploy-ready: auto-generates its own sample data on startup.
import gradio as gr
import pandas as pd
import numpy as np
from typing import Tuple, Dict
import base64
import warnings

warnings.filterwarnings('ignore')


def generate_sample_data():
    """Creates a messy CSV for demo purposes - runs on Space startup"""
    np.random.seed(42)
    n = 1000
    df = pd.DataFrame({
        'customer_id': [f'CUST_{i:05d}' for i in range(n)],  # High cardinality
        'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
        'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n),  # Another high-cardinality column
        'price': np.random.uniform(10, 1000, n).astype('float64'),   # Memory waste
        'quantity': np.random.randint(1, 10, n).astype('int64'),     # More memory waste
        'temperature': np.random.normal(20, 5, n),                   # For the missingness demo
        'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]),  # Missing values
        'target': np.random.choice([0, 1], n)                        # Binary target
    })
    # Introduce missingness that correlates with the target (leakage)
    df.loc[df['target'] == 1, 'temperature'] = np.nan
    # Mess up the datetime format for some rows
    df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')
    # Add a constant column (silent failure)
    df['version'] = 'v1.0'
    df.to_csv('sample_messy_data.csv', index=False)
    return 'sample_messy_data.csv'


class DSPreprocessor:
    """Auto-fixes the 5 things that waste your time"""

    def __init__(self):
        self.report = {"actions": [], "warnings": [], "stats": {}}

    def fit_transform(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
        # 1. Memory Killer: downcast numeric types (often 50-90% RAM savings)
        start_mem = df.memory_usage(deep=True).sum() / 1024**2
        for col in df.select_dtypes(include=['int64', 'float64']).columns:
            col_type = df[col].dtype
            try:
                if col_type == 'int64':
                    df[col] = pd.to_numeric(df[col], downcast='integer')
                else:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                if df[col].dtype != col_type:
                    self.report["actions"].append(f"✓ {col}: {col_type} → {df[col].dtype}")
            except Exception:
                pass

        # 2. DateTime Hell: auto-detect and parse object columns.
        # format='mixed' (pandas >= 2.0) replaces the deprecated infer_datetime_format.
        for col in df.select_dtypes(include=['object']).columns:
            try:
                parsed = pd.to_datetime(df[col], errors='coerce', format='mixed')
                if parsed.notnull().sum() > len(df) * 0.3:
                    df[col] = parsed
                    self.report["actions"].append(f"✓ {col}: Parsed datetime ({parsed.notnull().sum()} valid)")
            except Exception:
                pass

        # 3. Categorical Explosion: encode high-cardinality strings as integer
        # category codes (note: the original labels are discarded)
        for col in df.select_dtypes(include=['object']).columns:
            n_unique = df[col].nunique()
            if n_unique > len(df) * 0.5:
                df[col] = df[col].astype('category').cat.codes
                self.report["warnings"].append(
                    f"⚠️ {col}: {n_unique:,} unique values → converted to category codes"
                )

        # 4. Leakage Tell: flag pairs of columns whose missingness co-occurs
        missing_corr = df.isnull().corr()
        high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
        high_corr.columns = ['col_a', 'col_b', 'r']
        # Drop the diagonal and report each pair only once
        high_corr = high_corr[high_corr['col_a'] < high_corr['col_b']]
        for _, row in high_corr.iterrows():
            self.report["warnings"].append(
                f"⚠️ Missingness correlation: {row['col_a']} ↔ {row['col_b']} (r={row['r']:.2f})"
            )

        # 5. Silent Failures: detect constant columns
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            self.report["warnings"].append(f"⚠️ Constant columns (drop these): {constant_cols}")

        # Final stats
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        self.report["stats"] = {
            "Memory saved": f"{start_mem - end_mem:.1f} MB ({100 * (1 - end_mem / start_mem):.0f}%)",
            "Rows": len(df),
            "Columns": len(df.columns),
            "Dtypes optimized": len([a for a in self.report["actions"] if "→" in a])
        }
        return df, self.report
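
# Optional smoke test - a minimal sketch for exercising the preprocessor without
# the UI, using only this file's own API. RUN_SMOKE_TEST is an illustrative flag
# added for this sketch, not a Gradio/Spaces convention.
RUN_SMOKE_TEST = False
if RUN_SMOKE_TEST:
    _df = pd.read_csv(generate_sample_data())
    _cleaned, _report = DSPreprocessor().fit_transform(_df)
    print(_report["stats"])     # e.g. memory saved, dtype conversions
    print(_report["warnings"])  # constant columns, co-occurring missingness
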
def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, str]:
    """Gradio callback: clean the uploaded CSV and build a download link."""
    if file_obj is None:
        return None, None, "Upload a CSV first"
    df = pd.read_csv(file_obj.name)
    preprocessor = DSPreprocessor()
    # Move the target column (if given) to the end so it is easy to spot
    if target_col and target_col in df.columns:
        df = df[[c for c in df.columns if c != target_col] + [target_col]]
    cleaned_df, report = preprocessor.fit_transform(df)
    # Embed the cleaned CSV in a base64 data URI so no server-side file is needed
    csv_bytes = cleaned_df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'<a href="data:text/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
    return cleaned_df, report, href


# Generate sample data on startup
sample_file = generate_sample_data()

# UI
with gr.Blocks(title="DS AutoPrep") as demo:
    gr.Markdown("# 🚀 DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
    with gr.Row():
        file_input = gr.File(label="Upload CSV", file_types=[".csv"])
        target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
    go_btn = gr.Button("🔥 Clean My Data", variant="primary")
    with gr.Tabs():
        with gr.TabItem("Cleaned Data"):
            data_output = gr.Dataframe()
        with gr.TabItem("Audit Report"):
            report_output = gr.JSON()
        with gr.TabItem("Download"):
            download_html = gr.HTML()
    # Pre-loaded example so visitors can try the app in one click
    gr.Examples(
        examples=[sample_file],
        inputs=[file_input],
        label="Try with sample data"
    )
    go_btn.click(
        fn=process_file,
        inputs=[file_input, target_input],
        outputs=[data_output, report_output, download_html]
    )

demo.launch()
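
# Deployment note (assumptions about the Space setup, not verified here): on a
# Gradio Space the gradio version comes from sdk_version in the Space's README.md,
# and extra packages (e.g. pandas, numpy) are installed from requirements.txt if
# the base image lacks them. Locally, `python app.py` serves the same UI at the
# URL that demo.launch() prints.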