# Hugging Face Spaces page residue (status banner, not code): "Spaces: Sleeping"
# app.py - Deploy to Hugging Face Space (New Space -> Gradio -> Paste this)
# app.py - Deploy-ready with auto-generated sample data
import gradio as gr
import pandas as pd
import numpy as np
from typing import Tuple, Dict, Any
import io
import base64
import warnings
# Blanket-suppress warnings for a clean demo UI (e.g. pandas FutureWarnings).
# NOTE(review): this also hides genuinely useful warnings — consider narrowing.
warnings.filterwarnings('ignore')
def generate_sample_data(n: int = 1000, path: str = 'sample_messy_data.csv') -> str:
    """Create a deliberately messy CSV for demo purposes (runs on Space startup).

    The file exhibits the five problems DSPreprocessor targets:
    high-cardinality IDs, oversized numeric dtypes, mixed datetime formats,
    target-correlated missingness (leakage), and a constant column.

    Args:
        n: Number of rows to generate (default 1000, matching the original demo).
        path: Output CSV path (default matches the original hard-coded name).

    Returns:
        The path of the written CSV file.
    """
    # Seed the legacy global RNG so the demo file is reproducible across runs.
    np.random.seed(42)
    df = pd.DataFrame({
        'customer_id': [f'CUST_{i:05d}' for i in range(n)],  # high cardinality
        'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
        'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n),  # another high-cardinality col
        'price': np.random.uniform(10, 1000, n).astype('float64'),   # wasteful dtype
        'quantity': np.random.randint(1, 10, n).astype('int64'),     # wasteful dtype
        'temperature': np.random.normal(20, 5, n),                   # for interpolation demo
        'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]),  # missing values
        'target': np.random.choice([0, 1], n),                       # binary target
    })
    # Leakage demo: temperature is missing exactly when target == 1.
    df.loc[df['target'] == 1, 'temperature'] = np.nan
    # Mixed datetime formats: every 100th row uses the ISO 'T' separator.
    df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')
    # Constant column (silent-failure demo).
    df['version'] = 'v1.0'
    df.to_csv(path, index=False)
    return path
class DSPreprocessor:
    """Auto-fixes the 5 things that waste your time.

    ``fit_transform`` cleans the DataFrame **in place** and records an audit
    trail in ``self.report`` with keys ``actions`` (changes applied),
    ``warnings`` (issues flagged but not changed), and ``stats`` (summary).
    """

    def __init__(self):
        # Audit trail populated by fit_transform().
        self.report = {"actions": [], "warnings": [], "stats": {}}

    def fit_transform(self, df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
        """Clean *df* in place and return ``(df, report)``.

        Steps: numeric downcasting, datetime auto-parsing, high-cardinality
        string encoding, missingness-correlation (leakage) detection, and
        constant-column detection.
        """
        # 1. Memory killer: downcast numeric types (50-90% RAM savings).
        start_mem = df.memory_usage(deep=True).sum() / 1024**2
        for col in df.select_dtypes(include=['int64', 'float64']).columns:
            col_type = df[col].dtype
            try:
                if col_type == 'int64':
                    df[col] = pd.to_numeric(df[col], downcast='integer')
                else:
                    df[col] = pd.to_numeric(df[col], downcast='float')
                if df[col].dtype != col_type:
                    self.report["actions"].append(f"OK {col}: {col_type} -> {df[col].dtype}")
            except (ValueError, TypeError):
                # Column could not be coerced; leave its dtype unchanged.
                # (Was a bare `except:` which also swallowed KeyboardInterrupt.)
                pass

        # 2. Datetime hell: auto-detect and parse string columns.
        # `infer_datetime_format=True` was dropped: deprecated in pandas >= 2.0,
        # where format inference is the default behavior.
        for col in df.select_dtypes(include=['object']).columns:
            try:
                parsed = pd.to_datetime(df[col], errors='coerce')
            except (ValueError, TypeError):
                continue
            # Convert only if >30% of values parse — avoids mangling ID columns.
            if parsed.notnull().sum() > len(df) * 0.3:
                df[col] = parsed
                self.report["actions"].append(
                    f"OK {col}: Parsed datetime ({parsed.notnull().sum()} valid)"
                )

        # 3. Categorical explosion: integer-encode high-cardinality strings.
        for col in df.select_dtypes(include=['object']).columns:
            n_unique = df[col].nunique()
            if n_unique > len(df) * 0.5:
                df[col] = df[col].astype('category').cat.codes
                self.report["warnings"].append(
                    f"WARN {col}: {n_unique:,} unique values -> encoded to codes"
                )

        # 4. Target leakage: flag column pairs whose missingness patterns are
        # highly correlated. The corr matrix is symmetric, so deduplicate —
        # the original reported every pair twice (A<->B and B<->A).
        missing_corr = df.isnull().corr()
        high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
        high_corr = high_corr[high_corr['level_0'] != high_corr['level_1']]
        seen = set()
        for _, row in high_corr.iterrows():
            pair = frozenset((row['level_0'], row['level_1']))
            if pair in seen:
                continue
            seen.add(pair)
            self.report["warnings"].append(
                f"WARN Missingness correlation: {row['level_0']} <-> {row['level_1']} (r={row[0]:.2f})"
            )

        # 5. Silent failures: detect constant columns (nunique ignores NaN, so
        # an all-NaN column also counts as constant).
        constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
        if constant_cols:
            self.report["warnings"].append(f"WARN Constant columns (drop these): {constant_cols}")

        # Final stats. Guard the percentage against division by zero on an
        # empty frame (the original raised ZeroDivisionError there).
        end_mem = df.memory_usage(deep=True).sum() / 1024**2
        pct_saved = 100 * (1 - end_mem / start_mem) if start_mem else 0.0
        self.report["stats"] = {
            "Memory saved": f"{start_mem - end_mem:.1f} MB ({pct_saved:.0f}%)",
            "Rows": len(df),
            "Columns": len(df.columns),
            # Count only dtype-downcast actions (marked with '->'); the original
            # counted every action string, including datetime parses.
            "Dtypes optimized": len([a for a in self.report["actions"] if "->" in a]),
        }
        return df, self.report
def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, str]:
    """Gradio callback: load a CSV, clean it, and build a download link.

    Args:
        file_obj: Uploaded file. Newer Gradio versions pass a plain path
            string, older ones an object with a ``.name`` attribute — both
            are accepted (the original only handled ``.name`` and crashed
            with AttributeError on a str).
        target_col: Optional target column name; if present it is moved to
            the last position so it is easy to spot in the preview.

    Returns:
        ``(cleaned DataFrame, audit report dict, HTML download link)``, or
        ``(None, None, message)`` when no file was provided.
    """
    if file_obj is None:
        return None, None, "Upload a CSV first"
    path = file_obj if isinstance(file_obj, str) else file_obj.name
    df = pd.read_csv(path)
    preprocessor = DSPreprocessor()
    # Reorder so the (optional) target is the last column.
    if target_col and target_col in df.columns:
        df = df[[c for c in df.columns if c != target_col] + [target_col]]
    cleaned_df, report = preprocessor.fit_transform(df)
    # Embed the cleaned CSV as a base64 data URI for a one-click download.
    csv_bytes = cleaned_df.to_csv(index=False).encode()
    b64 = base64.b64encode(csv_bytes).decode()
    href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
    return cleaned_df, report, href
# Generate the sample CSV once at startup so the Examples widget has a file to offer.
sample_file = generate_sample_data()

# UI: Blocks layout — upload row, action button, three result tabs.
with gr.Blocks(title="DS AutoPrep") as demo:
    gr.Markdown("# π DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
    with gr.Row():
        file_input = gr.File(label="Upload CSV", file_types=[".csv"])
        target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
    go_btn = gr.Button("π₯ Clean My Data", variant="primary")
    with gr.Tabs():
        with gr.TabItem("Cleaned Data"):
            data_output = gr.Dataframe()    # cleaned-data preview
        with gr.TabItem("Audit Report"):
            report_output = gr.JSON()       # actions / warnings / stats dict
        with gr.TabItem("Download"):
            download_html = gr.HTML()       # base64 data-URI download link
    # Pre-populated example so visitors can try the app without their own file.
    gr.Examples(
        examples=[sample_file],
        inputs=[file_input],
        label="Try with sample data"
    )
    # Wire the button: process_file(file, target) -> (dataframe, report, link).
    go_btn.click(
        fn=process_file,
        inputs=[file_input, target_input],
        outputs=[data_output, report_output, download_html]
    )
demo.launch()