Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# app.py - Deploy to Hugging Face Space (New → Gradio → Paste this)
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
|
@@ -8,6 +9,34 @@ import base64
|
|
| 8 |
import warnings
|
| 9 |
warnings.filterwarnings('ignore')
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
class DSPreprocessor:
|
| 12 |
"""Auto-fixes the 5 things that waste your time"""
|
| 13 |
|
|
@@ -29,10 +58,9 @@ class DSPreprocessor:
|
|
| 29 |
except:
|
| 30 |
pass
|
| 31 |
|
| 32 |
-
# 2. DateTime Hell: Auto-detect and parse
|
| 33 |
for col in df.select_dtypes(include=['object']).columns:
|
| 34 |
try:
|
| 35 |
-
# Try parsing if >30% looks like dates
|
| 36 |
parsed = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
|
| 37 |
if parsed.notnull().sum() > len(df) * 0.3:
|
| 38 |
df[col] = parsed
|
|
@@ -40,16 +68,16 @@ class DSPreprocessor:
|
|
| 40 |
except:
|
| 41 |
pass
|
| 42 |
|
| 43 |
-
# 3. Categorical Explosion: Hash high-cardinality strings
|
| 44 |
for col in df.select_dtypes(include=['object']).columns:
|
| 45 |
n_unique = df[col].nunique()
|
| 46 |
if n_unique > len(df) * 0.5:
|
| 47 |
df[col] = df[col].astype('category').cat.codes
|
| 48 |
self.report["warnings"].append(
|
| 49 |
-
f"β οΈ {col}: {n_unique:,} unique values β Hashed to codes
|
| 50 |
)
|
| 51 |
|
| 52 |
-
# 4. Missing Target Leakage: Flag
|
| 53 |
missing_corr = df.isnull().corr()
|
| 54 |
high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
|
| 55 |
high_corr = high_corr[high_corr['level_0'] != high_corr['level_1']]
|
|
@@ -60,7 +88,7 @@ class DSPreprocessor:
|
|
| 60 |
f"β οΈ Missingness correlation: {row['level_0']} β {row['level_1']} (r={row[0]:.2f})"
|
| 61 |
)
|
| 62 |
|
| 63 |
-
# 5. Silent Failures: Detect constant columns
|
| 64 |
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
|
| 65 |
if constant_cols:
|
| 66 |
self.report["warnings"].append(f"β οΈ Constant columns (drop these): {constant_cols}")
|
|
@@ -84,30 +112,29 @@ def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, st
|
|
| 84 |
df = pd.read_csv(file_obj.name)
|
| 85 |
preprocessor = DSPreprocessor()
|
| 86 |
|
| 87 |
-
# Optional target column for leakage check
|
| 88 |
if target_col and target_col in df.columns:
|
| 89 |
-
# Move target to end for clarity
|
| 90 |
df = df[[c for c in df.columns if c != target_col] + [target_col]]
|
| 91 |
|
| 92 |
cleaned_df, report = preprocessor.fit_transform(df)
|
| 93 |
|
| 94 |
-
# Create download link
|
| 95 |
csv_bytes = cleaned_df.to_csv(index=False).encode()
|
| 96 |
b64 = base64.b64encode(csv_bytes).decode()
|
| 97 |
href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
|
| 98 |
|
| 99 |
return cleaned_df, report, href
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
with gr.Row():
|
| 106 |
file_input = gr.File(label="Upload CSV", file_types=[".csv"])
|
| 107 |
-
target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g.,
|
| 108 |
|
| 109 |
-
|
| 110 |
-
go_btn = gr.Button("π₯ Clean My Data", variant="primary", size="lg")
|
| 111 |
|
| 112 |
with gr.Tabs():
|
| 113 |
with gr.TabItem("Cleaned Data"):
|
|
@@ -117,16 +144,17 @@ with gr.Blocks(title="DS Preprocessor Pro") as demo:
|
|
| 117 |
with gr.TabItem("Download"):
|
| 118 |
download_html = gr.HTML()
|
| 119 |
|
| 120 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
go_btn.click(
|
| 122 |
fn=process_file,
|
| 123 |
inputs=[file_input, target_input],
|
| 124 |
outputs=[data_output, report_output, download_html]
|
| 125 |
)
|
| 126 |
-
|
| 127 |
-
gr.Examples(
|
| 128 |
-
examples=["sample_messy_data.csv"], # Create a sample file in your Space
|
| 129 |
-
inputs=[file_input]
|
| 130 |
-
)
|
| 131 |
|
| 132 |
demo.launch()
|
|
|
|
| 1 |
# app.py - Deploy to Hugging Face Space (New → Gradio → Paste this)
|
| 2 |
+
# app.py - Deploy-ready with auto-generated sample data
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
import numpy as np
|
|
|
|
| 9 |
import warnings
|
| 10 |
warnings.filterwarnings('ignore')
|
| 11 |
|
| 12 |
+
def generate_sample_data(n: int = 1000,
                         path: str = 'sample_messy_data.csv',
                         seed: int = 42) -> str:
    """Create a deliberately messy demo CSV and return its file path.

    Runs on Space startup. The generated file exercises every check in
    the preprocessor: high-cardinality string IDs, mixed datetime
    formats, oversized numeric dtypes, missing values, missingness that
    correlates with the target (leakage), and a constant column.

    Args:
        n: Number of rows to generate (default 1000, the original value).
        path: Output CSV path (default 'sample_messy_data.csv').
        seed: NumPy RNG seed for reproducibility (default 42).

    Returns:
        The path of the CSV file that was written.
    """
    np.random.seed(seed)

    df = pd.DataFrame({
        # High-cardinality string ID -> exercises categorical hashing.
        'customer_id': [f'CUST_{i:05d}' for i in range(n)],
        # Timestamps stored as strings -> exercises datetime auto-parsing.
        'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
        # Another high-cardinality column (800 SKUs over n rows).
        'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n),
        # Deliberately oversized dtypes -> exercises memory optimization.
        'price': np.random.uniform(10, 1000, n).astype('float64'),
        'quantity': np.random.randint(1, 10, n).astype('int64'),
        # Numeric column used for the interpolation / leakage demo.
        'temperature': np.random.normal(20, 5, n),
        # ~10% missing categories via the None choice.
        'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]),
        # Binary target column.
        'target': np.random.choice([0, 1], n),
    })

    # Introduce missingness that correlates with the target (leakage demo):
    # temperature is NaN exactly where target == 1.
    df.loc[df['target'] == 1, 'temperature'] = np.nan

    # Every 100th row uses a 'T'-separated timestamp (mixed-format demo).
    df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')

    # Constant column (silent-failure demo).
    df['version'] = 'v1.0'

    df.to_csv(path, index=False)
    return path
|
| 39 |
+
|
| 40 |
class DSPreprocessor:
|
| 41 |
"""Auto-fixes the 5 things that waste your time"""
|
| 42 |
|
|
|
|
| 58 |
except:
|
| 59 |
pass
|
| 60 |
|
| 61 |
+
# 2. DateTime Hell: Auto-detect and parse
|
| 62 |
for col in df.select_dtypes(include=['object']).columns:
|
| 63 |
try:
|
|
|
|
| 64 |
parsed = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
|
| 65 |
if parsed.notnull().sum() > len(df) * 0.3:
|
| 66 |
df[col] = parsed
|
|
|
|
| 68 |
except:
|
| 69 |
pass
|
| 70 |
|
| 71 |
+
# 3. Categorical Explosion: Hash high-cardinality strings
|
| 72 |
for col in df.select_dtypes(include=['object']).columns:
|
| 73 |
n_unique = df[col].nunique()
|
| 74 |
if n_unique > len(df) * 0.5:
|
| 75 |
df[col] = df[col].astype('category').cat.codes
|
| 76 |
self.report["warnings"].append(
|
| 77 |
+
f"β οΈ {col}: {n_unique:,} unique values β Hashed to codes"
|
| 78 |
)
|
| 79 |
|
| 80 |
+
# 4. Missing Target Leakage: Flag correlated missingness
|
| 81 |
missing_corr = df.isnull().corr()
|
| 82 |
high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
|
| 83 |
high_corr = high_corr[high_corr['level_0'] != high_corr['level_1']]
|
|
|
|
| 88 |
f"β οΈ Missingness correlation: {row['level_0']} β {row['level_1']} (r={row[0]:.2f})"
|
| 89 |
)
|
| 90 |
|
| 91 |
+
# 5. Silent Failures: Detect constant columns
|
| 92 |
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
|
| 93 |
if constant_cols:
|
| 94 |
self.report["warnings"].append(f"β οΈ Constant columns (drop these): {constant_cols}")
|
|
|
|
| 112 |
df = pd.read_csv(file_obj.name)
|
| 113 |
preprocessor = DSPreprocessor()
|
| 114 |
|
|
|
|
| 115 |
if target_col and target_col in df.columns:
|
|
|
|
| 116 |
df = df[[c for c in df.columns if c != target_col] + [target_col]]
|
| 117 |
|
| 118 |
cleaned_df, report = preprocessor.fit_transform(df)
|
| 119 |
|
|
|
|
| 120 |
csv_bytes = cleaned_df.to_csv(index=False).encode()
|
| 121 |
b64 = base64.b64encode(csv_bytes).decode()
|
| 122 |
href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
|
| 123 |
|
| 124 |
return cleaned_df, report, href
|
| 125 |
|
| 126 |
+
# Generate sample data on startup
|
| 127 |
+
sample_file = generate_sample_data()
|
| 128 |
+
|
| 129 |
+
# UI
|
| 130 |
+
with gr.Blocks(title="DS AutoPrep") as demo:
|
| 131 |
+
gr.Markdown("# π DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
|
| 132 |
|
| 133 |
with gr.Row():
|
| 134 |
file_input = gr.File(label="Upload CSV", file_types=[".csv"])
|
| 135 |
+
target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
|
| 136 |
|
| 137 |
+
go_btn = gr.Button("π₯ Clean My Data", variant="primary")
|
|
|
|
| 138 |
|
| 139 |
with gr.Tabs():
|
| 140 |
with gr.TabItem("Cleaned Data"):
|
|
|
|
| 144 |
with gr.TabItem("Download"):
|
| 145 |
download_html = gr.HTML()
|
| 146 |
|
| 147 |
+
# Auto-load example on startup
|
| 148 |
+
gr.Examples(
|
| 149 |
+
examples=[sample_file],
|
| 150 |
+
inputs=[file_input],
|
| 151 |
+
label="Try with sample data"
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
go_btn.click(
|
| 155 |
fn=process_file,
|
| 156 |
inputs=[file_input, target_input],
|
| 157 |
outputs=[data_output, report_output, download_html]
|
| 158 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
demo.launch()
|