omasteam committed on
Commit
4a7095a
·
verified ·
1 Parent(s): 0ab082c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -21
app.py CHANGED
@@ -1,4 +1,5 @@
1
  # app.py - Deploy to Hugging Face Space (New → Gradio → Paste this)
 
2
  import gradio as gr
3
  import pandas as pd
4
  import numpy as np
@@ -8,6 +9,34 @@ import base64
8
  import warnings
9
  warnings.filterwarnings('ignore')
10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  class DSPreprocessor:
12
  """Auto-fixes the 5 things that waste your time"""
13
 
@@ -29,10 +58,9 @@ class DSPreprocessor:
29
  except:
30
  pass
31
 
32
- # 2. DateTime Hell: Auto-detect and parse (handles 3 formats in one column)
33
  for col in df.select_dtypes(include=['object']).columns:
34
  try:
35
- # Try parsing if >30% looks like dates
36
  parsed = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
37
  if parsed.notnull().sum() > len(df) * 0.3:
38
  df[col] = parsed
@@ -40,16 +68,16 @@ class DSPreprocessor:
40
  except:
41
  pass
42
 
43
- # 3. Categorical Explosion: Hash high-cardinality strings (prevents memory blowup)
44
  for col in df.select_dtypes(include=['object']).columns:
45
  n_unique = df[col].nunique()
46
  if n_unique > len(df) * 0.5:
47
  df[col] = df[col].astype('category').cat.codes
48
  self.report["warnings"].append(
49
- f"⚠️ {col}: {n_unique:,} unique values β†’ Hashed to codes (category leak risk)"
50
  )
51
 
52
- # 4. Missing Target Leakage: Flag if missingness correlates with any column
53
  missing_corr = df.isnull().corr()
54
  high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
55
  high_corr = high_corr[high_corr['level_0'] != high_corr['level_1']]
@@ -60,7 +88,7 @@ class DSPreprocessor:
60
  f"⚠️ Missingness correlation: {row['level_0']} ↔ {row['level_1']} (r={row[0]:.2f})"
61
  )
62
 
63
- # 5. Silent Failures: Detect constant columns (screw up scaling)
64
  constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
65
  if constant_cols:
66
  self.report["warnings"].append(f"⚠️ Constant columns (drop these): {constant_cols}")
@@ -84,30 +112,29 @@ def process_file(file_obj, target_col: str = "") -> Tuple[pd.DataFrame, Dict, st
84
  df = pd.read_csv(file_obj.name)
85
  preprocessor = DSPreprocessor()
86
 
87
- # Optional target column for leakage check
88
  if target_col and target_col in df.columns:
89
- # Move target to end for clarity
90
  df = df[[c for c in df.columns if c != target_col] + [target_col]]
91
 
92
  cleaned_df, report = preprocessor.fit_transform(df)
93
 
94
- # Create download link
95
  csv_bytes = cleaned_df.to_csv(index=False).encode()
96
  b64 = base64.b64encode(csv_bytes).decode()
97
  href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
98
 
99
  return cleaned_df, report, href
100
 
101
- # UI (Gradio)
102
- with gr.Blocks(title="DS Preprocessor Pro") as demo:
103
- gr.Markdown("## πŸš€ Data Science Preprocessor Pro\nUpload a messy CSV. Get back clean data + audit report.")
 
 
 
104
 
105
  with gr.Row():
106
  file_input = gr.File(label="Upload CSV", file_types=[".csv"])
107
- target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., price")
108
 
109
- with gr.Row():
110
- go_btn = gr.Button("πŸ”₯ Clean My Data", variant="primary", size="lg")
111
 
112
  with gr.Tabs():
113
  with gr.TabItem("Cleaned Data"):
@@ -117,16 +144,17 @@ with gr.Blocks(title="DS Preprocessor Pro") as demo:
117
  with gr.TabItem("Download"):
118
  download_html = gr.HTML()
119
 
120
- # Magic happens here
 
 
 
 
 
 
121
  go_btn.click(
122
  fn=process_file,
123
  inputs=[file_input, target_input],
124
  outputs=[data_output, report_output, download_html]
125
  )
126
-
127
- gr.Examples(
128
- examples=["sample_messy_data.csv"], # Create a sample file in your Space
129
- inputs=[file_input]
130
- )
131
 
132
  demo.launch()
 
1
  # app.py - Deploy to Hugging Face Space (New → Gradio → Paste this)
2
+ # app.py - Deploy-ready with auto-generated sample data
3
  import gradio as gr
4
  import pandas as pd
5
  import numpy as np
 
9
  import warnings
10
  warnings.filterwarnings('ignore')
11
 
12
+ def generate_sample_data():
13
+ """Creates a messy CSV for demo purposes - runs on Space startup"""
14
+ np.random.seed(42)
15
+ n = 1000
16
+
17
+ df = pd.DataFrame({
18
+ 'customer_id': [f'CUST_{i:05d}' for i in range(n)], # High cardinality
19
+ 'purchase_date': pd.date_range('2023-01-01', periods=n).strftime('%Y-%m-%d %H:%M:%S'),
20
+ 'product_sku': np.random.choice([f'SKU_{i}' for i in range(800)], n), # Another high-cardinality
21
+ 'price': np.random.uniform(10, 1000, n).astype('float64'), # Memory waste
22
+ 'quantity': np.random.randint(1, 10, n).astype('int64'), # More memory waste
23
+ 'temperature': np.random.normal(20, 5, n), # For interpolation demo
24
+ 'category': np.random.choice(['A', 'B', 'C', None], n, p=[0.3, 0.3, 0.3, 0.1]), # Missing values
25
+ 'target': np.random.choice([0, 1], n) # Binary target
26
+ })
27
+
28
+ # Introduce missingness that correlates with target (leakage)
29
+ df.loc[df['target'] == 1, 'temperature'] = np.nan
30
+
31
+ # Mess up datetime format for some rows
32
+ df.loc[::100, 'purchase_date'] = df.loc[::100, 'purchase_date'].str.replace(' ', 'T')
33
+
34
+ # Add constant column (silent failure)
35
+ df['version'] = 'v1.0'
36
+
37
+ df.to_csv('sample_messy_data.csv', index=False)
38
+ return 'sample_messy_data.csv'
39
+
40
  class DSPreprocessor:
41
  """Auto-fixes the 5 things that waste your time"""
42
 
 
58
  except:
59
  pass
60
 
61
+ # 2. DateTime Hell: Auto-detect and parse
62
  for col in df.select_dtypes(include=['object']).columns:
63
  try:
 
64
  parsed = pd.to_datetime(df[col], errors='coerce', infer_datetime_format=True)
65
  if parsed.notnull().sum() > len(df) * 0.3:
66
  df[col] = parsed
 
68
  except:
69
  pass
70
 
71
+ # 3. Categorical Explosion: Hash high-cardinality strings
72
  for col in df.select_dtypes(include=['object']).columns:
73
  n_unique = df[col].nunique()
74
  if n_unique > len(df) * 0.5:
75
  df[col] = df[col].astype('category').cat.codes
76
  self.report["warnings"].append(
77
+ f"⚠️ {col}: {n_unique:,} unique values β†’ Hashed to codes"
78
  )
79
 
80
+ # 4. Missing Target Leakage: Flag correlated missingness
81
  missing_corr = df.isnull().corr()
82
  high_corr = missing_corr[missing_corr.abs() > 0.9].stack().reset_index()
83
  high_corr = high_corr[high_corr['level_0'] != high_corr['level_1']]
 
88
  f"⚠️ Missingness correlation: {row['level_0']} ↔ {row['level_1']} (r={row[0]:.2f})"
89
  )
90
 
91
+ # 5. Silent Failures: Detect constant columns
92
  constant_cols = [col for col in df.columns if df[col].nunique() <= 1]
93
  if constant_cols:
94
  self.report["warnings"].append(f"⚠️ Constant columns (drop these): {constant_cols}")
 
112
  df = pd.read_csv(file_obj.name)
113
  preprocessor = DSPreprocessor()
114
 
 
115
  if target_col and target_col in df.columns:
 
116
  df = df[[c for c in df.columns if c != target_col] + [target_col]]
117
 
118
  cleaned_df, report = preprocessor.fit_transform(df)
119
 
 
120
  csv_bytes = cleaned_df.to_csv(index=False).encode()
121
  b64 = base64.b64encode(csv_bytes).decode()
122
  href = f'<a href="data:file/csv;base64,{b64}" download="cleaned_data.csv">Download Cleaned CSV</a>'
123
 
124
  return cleaned_df, report, href
125
 
126
+ # Generate sample data on startup
127
+ sample_file = generate_sample_data()
128
+
129
+ # UI
130
+ with gr.Blocks(title="DS AutoPrep") as demo:
131
+ gr.Markdown("# πŸš€ DS AutoPrep\n**Zero-config CSV cleaning + leak detection + memory optimization**")
132
 
133
  with gr.Row():
134
  file_input = gr.File(label="Upload CSV", file_types=[".csv"])
135
+ target_input = gr.Textbox(label="Target column (optional)", placeholder="e.g., target")
136
 
137
+ go_btn = gr.Button("πŸ”₯ Clean My Data", variant="primary")
 
138
 
139
  with gr.Tabs():
140
  with gr.TabItem("Cleaned Data"):
 
144
  with gr.TabItem("Download"):
145
  download_html = gr.HTML()
146
 
147
+ # Auto-load example on startup
148
+ gr.Examples(
149
+ examples=[sample_file],
150
+ inputs=[file_input],
151
+ label="Try with sample data"
152
+ )
153
+
154
  go_btn.click(
155
  fn=process_file,
156
  inputs=[file_input, target_input],
157
  outputs=[data_output, report_output, download_html]
158
  )
 
 
 
 
 
159
 
160
  demo.launch()