Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
# app.py - Deploy to Hugging Face Space (New → Gradio → Paste this)
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
|
@@ -8,6 +9,34 @@ import base64
|
|
| 8 |
import warnings
|
| 9 |
warnings.filterwarnings('ignore')
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
class DSPreprocessor:
|
| 12 |
"""Auto-fixes the 5 things that waste your time"""
|
| 13 |
|
|
@@ -29,10 +58,9 @@ class DSPreprocessor:
|
|
| 29 |
except:
|
| 30 |
pass
|
| 31 |
|
| 32 |
-
# 2. DateTime Hell: Auto-detect and parse
|
| 33 |
for col in df.select_dtypes(include=['object']).columns:
|
| 34 |
try:
|
| 35 |
-
# Try parsing if >30% looks like dates
|
| 36 |
parsed = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
|
| 37 |
if parsed.notnull().sum() > len(df) * 0.3:
|
| 38 |
df[col] = parsed
|
|
@@ -40,16 +68,16 @@ class DSPreprocessor:
|
|
| 40 |
except:
|
| 41 |
pass
|
| 42 |
|
| 43 |
-
# 3. Categorical Explosion: Hash high-cardinality strings
|
| 44 |
for col in df.select_dtypes(include=['object']).columns:
|
| 45 |
n_unique = df[col].nunique()
|
| 46 |
if n_unique > len(df) * 0.5:
|
| 47 |
df[col] = df[col].astype('category').cat.codes
|
| 48 |
self.report["warnings"].append(
|
| 49 |
-
f"β οΈ {col}: {n_unique:,} unique values β Hashed to codes
|
| 50 |
)
|
| 51 |
|
| 52 |
-
# 4. Missing Target Leakage: Flag
|
| 53 |
missing_corr = df.isnull().corr()
|
| 54 |
high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
|
| 55 |
high_corr = high_corr[high_corr['level_0'] != high_corr['level_1']]
|
|
@@ -60,7 +88,7 @@ class DSPreprocessor:
|
|
| 60 |
f"β οΈ Missingness correlation: {row['level_0']} β {row['level_1']} (r={row[0]:.2f})"
|
| 61 |
)
|
| 62 |
|
| 63 |
-
# 5. Silent Failures: Detect constant columns
|
| 64 |
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
|
| 65 |
if constant_cols:
|
| 66 |
self.report["warnings"].append(f"β οΈ Constant columns (drop these): {constant_cols}")
|
|
@@ -84,30 +112,29 @@ def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, st
|
|
| 84 |
df = pd.read_csv(file_obj.name)
|
| 85 |
preprocessor = DSPreprocessor()
|
| 86 |
|
| 87 |
-
# Optional target column for leakage check
|
| 88 |
if target_col and target_col in df.columns:
|
| 89 |
-
# Move target to end for clarity
|
| 90 |
df = df[[c for c in df.columns if c != target_col] + [target_col]]
|
| 91 |
|
| 92 |
cleaned_df, report = preprocessor.fit_transform(df)
|
| 93 |
|
| 94 |
-
# Create download link
|
| 95 |
csv_bytes = cleaned_df.to_csv(index=False).encode()
|
| 96 |
b64 = base64.b64encode(csv_bytes).decode()
|
| 97 |
href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
|
| 98 |
|
| 99 |
return cleaned_df, report, href
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
with gr.Row():
|
| 106 |
file_input = gr.File(label="Upload CSV", file_types=[".csv"])
|
| 107 |
-
target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g.,
|
| 108 |
|
| 109 |
-
|
| 110 |
-
go_btn = gr.Button("π₯ Clean My Data", variant="primary", size="lg")
|
| 111 |
|
| 112 |
with gr.Tabs():
|
| 113 |
with gr.TabItem("Cleaned Data"):
|
|
@@ -117,16 +144,17 @@ with gr.Blocks(title="DS Preprocessor Pro") as demo:
|
|
| 117 |
with gr.TabItem("Download"):
|
| 118 |
download_html = gr.HTML()
|
| 119 |
|
| 120 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
go_btn.click(
|
| 122 |
fn=process_file,
|
| 123 |
inputs=[file_input, target_input],
|
| 124 |
outputs=[data_output, report_output, download_html]
|
| 125 |
)
|
| 126 |
-
|
| 127 |
-
gr.Examples(
|
| 128 |
-
examples=["sample_messy_data.csv"], # Create a sample file in your Space
|
| 129 |
-
inputs=[file_input]
|
| 130 |
-
)
|
| 131 |
|
| 132 |
demo.launch()
|
|
|
|
| 1 |
# app.py - Deploy to Hugging Face Space (New → Gradio → Paste this)
|
| 2 |
+
# app.py - Deploy-ready with auto-generated sample data
|
| 3 |
import gradio as gr
|
| 4 |
import pandas as pd
|
| 5 |
import numpy as np
|
|
|
|
| 9 |
import warnings
|
| 10 |
warnings.filterwarnings('ignore')
|
| 11 |
|
| 12 |
+
def generate_sample_data(n: int = 1000,
                         path: str = 'sample_messy_data.csv',
                         seed: int = 42) -> str:
    """Create a deliberately messy demo CSV and return its file path.

    Runs on Space startup. The generated file exercises every check in
    the preprocessor: high-cardinality string IDs, mixed datetime
    formats, oversized numeric dtypes, missing values, missingness that
    correlates with the target (leakage), and a constant column.

    Args:
        n: Number of rows to generate (default 1000, the original value).
        path: Output CSV path (default 'sample_messy_data.csv').
        seed: NumPy RNG seed for reproducibility (default 42).

    Returns:
        The path of the CSV file that was written.
    """
    np.random.seed(seed)

    df = pd.DataFrame({
        # High-cardinality string ID -> exercises categorical hashing.
        'customer_id': [f'CUST_{i:05d}' for i in range(n)],
        # Timestamps stored as strings -> exercises datetime auto-parsing.
        'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
        # Another high-cardinality column (800 SKUs over n rows).
        'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n),
        # Deliberately oversized dtypes -> exercises memory optimization.
        'price': np.random.uniform(10, 1000, n).astype('float64'),
        'quantity': np.random.randint(1, 10, n).astype('int64'),
        # Numeric column used for the interpolation / leakage demo.
        'temperature': np.random.normal(20, 5, n),
        # ~10% missing categories via the None choice.
        'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]),
        # Binary target column.
        'target': np.random.choice([0, 1], n),
    })

    # Introduce missingness that correlates with the target (leakage demo):
    # temperature is NaN exactly where target == 1.
    df.loc[df['target'] == 1, 'temperature'] = np.nan

    # Every 100th row uses a 'T'-separated timestamp (mixed-format demo).
    df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')

    # Constant column (silent-failure demo).
    df['version'] = 'v1.0'

    df.to_csv(path, index=False)
    return path
|
| 39 |
+
|
| 40 |
class DSPreprocessor:
|
| 41 |
"""Auto-fixes the 5 things that waste your time"""
|
| 42 |
|
|
|
|
| 58 |
except:
|
| 59 |
pass
|
| 60 |
|
| 61 |
+
# 2. DateTime Hell: Auto-detect and parse
|
| 62 |
for col in df.select_dtypes(include=['object']).columns:
|
| 63 |
try:
|
|
|
|
| 64 |
parsed = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
|
| 65 |
if parsed.notnull().sum() > len(df) * 0.3:
|
| 66 |
df[col] = parsed
|
|
|
|
| 68 |
except:
|
| 69 |
pass
|
| 70 |
|
| 71 |
+
# 3. Categorical Explosion: Hash high-cardinality strings
|
| 72 |
for col in df.select_dtypes(include=['object']).columns:
|
| 73 |
n_unique = df[col].nunique()
|
| 74 |
if n_unique > len(df) * 0.5:
|
| 75 |
df[col] = df[col].astype('category').cat.codes
|
| 76 |
self.report["warnings"].append(
|
| 77 |
+
f"β οΈ {col}: {n_unique:,} unique values β Hashed to codes"
|
| 78 |
)
|
| 79 |
|
| 80 |
+
# 4. Missing Target Leakage: Flag correlated missingness
|
| 81 |
missing_corr = df.isnull().corr()
|
| 82 |
high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
|
| 83 |
high_corr = high_corr[high_corr['level_0'] != high_corr['level_1']]
|
|
|
|
| 88 |
f"β οΈ Missingness correlation: {row['level_0']} β {row['level_1']} (r={row[0]:.2f})"
|
| 89 |
)
|
| 90 |
|
| 91 |
+
# 5. Silent Failures: Detect constant columns
|
| 92 |
constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
|
| 93 |
if constant_cols:
|
| 94 |
self.report["warnings"].append(f"β οΈ Constant columns (drop these): {constant_cols}")
|
|
|
|
| 112 |
df = pd.read_csv(file_obj.name)
|
| 113 |
preprocessor = DSPreprocessor()
|
| 114 |
|
|
|
|
| 115 |
if target_col and target_col in df.columns:
|
|
|
|
| 116 |
df = df[[c for c in df.columns if c != target_col] + [target_col]]
|
| 117 |
|
| 118 |
cleaned_df, report = preprocessor.fit_transform(df)
|
| 119 |
|
|
|
|
| 120 |
csv_bytes = cleaned_df.to_csv(index=False).encode()
|
| 121 |
b64 = base64.b64encode(csv_bytes).decode()
|
| 122 |
href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
|
| 123 |
|
| 124 |
return cleaned_df, report, href
|
| 125 |
|
| 126 |
+
# Generate sample data on startup
|
| 127 |
+
sample_file = generate_sample_data()
|
| 128 |
+
|
| 129 |
+
# UI
|
| 130 |
+
with gr.Blocks(title="DS AutoPrep") as demo:
|
| 131 |
+
gr.Markdown("# π DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
|
| 132 |
|
| 133 |
with gr.Row():
|
| 134 |
file_input = gr.File(label="Upload CSV", file_types=[".csv"])
|
| 135 |
+
target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
|
| 136 |
|
| 137 |
+
go_btn = gr.Button("π₯ Clean My Data", variant="primary")
|
|
|
|
| 138 |
|
| 139 |
with gr.Tabs():
|
| 140 |
with gr.TabItem("Cleaned Data"):
|
|
|
|
| 144 |
with gr.TabItem("Download"):
|
| 145 |
download_html = gr.HTML()
|
| 146 |
|
| 147 |
+
# Auto-load example on startup
|
| 148 |
+
gr.Examples(
|
| 149 |
+
examples=[sample_file],
|
| 150 |
+
inputs=[file_input],
|
| 151 |
+
label="Try with sample data"
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
go_btn.click(
|
| 155 |
fn=process_file,
|
| 156 |
inputs=[file_input, target_input],
|
| 157 |
outputs=[data_output, report_output, download_html]
|
| 158 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
demo.launch()
|