Asma-Abid committed on
Commit
82dec99
·
verified ·
1 Parent(s): 44e90dc

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +1127 -0
  2. random_forest.pkl +3 -0
  3. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,1127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import re
5
+ import io
6
+ import os
7
+ import joblib
8
+ import matplotlib
9
+ matplotlib.use("Agg")
10
+ import matplotlib.pyplot as plt
11
+ import seaborn as sns
12
+ from datetime import datetime
13
+
14
+ from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
15
+ from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
16
+ from sklearn.metrics import (
17
+ accuracy_score, confusion_matrix, silhouette_score,
18
+ classification_report, f1_score, precision_score, recall_score
19
+ )
20
+ from sklearn.ensemble import RandomForestClassifier
21
+ from sklearn.svm import SVC
22
+ from sklearn.linear_model import LogisticRegression
23
+ from sklearn.tree import DecisionTreeClassifier
24
+ from sklearn.cluster import KMeans
25
+ from sklearn.feature_selection import mutual_info_classif
26
+ from sklearn.utils import resample
27
+
28
# ==========================================================
# PAGE CONFIG
# ==========================================================
_PAGE_SETTINGS = {
    "page_title": "AI AutoML Platform",
    "page_icon": "🤖",
    "layout": "wide",
}
st.set_page_config(**_PAGE_SETTINGS)

# ==========================================================
# SESSION STATE
# ==========================================================
# Each key is created only if it is absent from st.session_state:
#   history / model_results  - per-run result entries (model_results feeds the reports)
#   last_model_name / last_score - most recent run
#   selected_target          - target column chosen in the UI, referenced by reports
#   cleaned_df               - cleaned DataFrame kept for report generation
for _state_key, _initial_value in (
    ("history", []),
    ("last_model_name", None),
    ("last_score", None),
    ("model_results", []),
    ("selected_target", None),
    ("cleaned_df", None),
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _initial_value

# ==========================================================
# THEME CSS
# ==========================================================
# NOTE(review): the original indentation inside this literal is not
# recoverable from the diff view; the rules are reproduced as shown.
_THEME_CSS = """
<style>
.stApp {
background: linear-gradient(135deg,#0f172a,#111827,#020617);
color: white;
}
.big-title {
font-size: 42px;
font-weight: 800;
color: #38bdf8;
text-align:center;
padding:15px;
}
.sub-title {
text-align:center;
color:#cbd5e1;
font-size:18px;
margin-bottom:25px;
}
.section {
background:#0f172a;
padding:12px;
border-radius:12px;
color:#38bdf8;
font-weight:700;
font-size:24px;
margin-top:20px;
}
.stButton>button {
background:#38bdf8;
color:black;
border:none;
border-radius:10px;
font-weight:700;
}
.stButton>button:hover {
background:#0ea5e9;
color:white;
}
div[data-baseweb="select"] > div {
background:#1e293b !important;
color:white !important;
}
.model-result-box {
background:#1e293b;
padding:20px;
border-radius:12px;
border:2px solid #38bdf8;
margin:15px 0;
}
/* File Uploader Button */
.stFileUploader>div>div>button {
background:#38bdf8 !important;
color:black !important;
border:none !important;
border-radius:10px !important;
font-weight:700 !important;
}
.stFileUploader>div>div>button:hover {
background:#0ea5e9 !important;
color:white !important;
}
/* File Uploader Button Alternative Selectors */
.stFileUploader button {
background:#38bdf8 !important;
color:black !important;
border:none !important;
border-radius:10px !important;
font-weight:700 !important;
}
.stFileUploader button:hover {
background:#0ea5e9 !important;
color:white !important;
}
/* Download Buttons */
.stDownloadButton>button {
background:#38bdf8 !important;
color:black !important;
border:none !important;
border-radius:10px !important;
font-weight:700 !important;
}
.stDownloadButton>button:hover {
background:#0ea5e9 !important;
color:white !important;
}
/* File Uploader Label */
.stFileUploader label {
color:#38bdf8 !important;
font-size:16px !important;
font-weight:700 !important;
}
/* Selectbox Labels */
.stSelectbox label {
color:#38bdf8 !important;
font-size:16px !important;
font-weight:700 !important;
}
/* Text and Write Styling */
p {
color:#cbd5e1;
}
.stWrite {
color:#cbd5e1;
}
/* Center pyplot figures and add lateral padding */
.stPlotlyChart, .stPyplot {
display: flex;
justify-content: center;
}
.stPyplot {
padding: 0 50px;
}
.stPlotlyChart {
padding: 0 50px;
}
/* Centered containers */
.stContainer {
max-width: 95%;
margin-left: auto;
margin-right: auto;
}
/* Classification Report Text */
.stText {
color: white !important;
}
.stText pre {
color: white !important;
}
.stText * {
color: white !important;
}
</style>
"""
st.markdown(_THEME_CSS, unsafe_allow_html=True)

# ==========================================================
# HEADER
# ==========================================================
for _banner_html in (
    '<div class="big-title">🤖 AI AutoML Platform</div>',
    '<div class="sub-title">upload csv select model download trained model</div>',
):
    st.markdown(_banner_html, unsafe_allow_html=True)
203
+
204
+ # ==========================================================
205
+ # HELPERS
206
+ # ==========================================================
207
def smart_clean(df):
    """Return a de-duplicated copy of ``df`` with missing values imputed.

    Text columns (object or pandas string dtype) are filled with their most
    frequent value; every other column is filled with its median, which is
    more robust to outliers than the mean.  A column that has no usable
    fill value (for example, one that is entirely missing) is left as is
    instead of raising.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame without duplicate rows and with gaps filled where a
        fill value could be computed.
    """
    cleaned = df.drop_duplicates().copy()

    for col in cleaned.columns:
        series = cleaned[col]
        if not series.isna().any():
            continue  # nothing to impute

        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if is_text:
            modes = series.mode()
            # mode() is empty when every value is missing; indexing [0]
            # would raise, so skip the column instead.
            if modes.empty:
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = series.median()
            if pd.isna(fill_value):
                continue

        cleaned[col] = series.fillna(fill_value)

    return cleaned
219
+
220
+
221
def convert_units(value):
    """Convert a length written with a unit into a number of metres.

    The first number in the text is read (an explicit leading sign is
    honoured), and the first alphabetic word after it is taken as the unit.
    ``km``, ``cm`` and ``mm`` are scaled to metres; any other or missing
    unit leaves the number unscaled.

    Args:
        value: Any value; it is converted with ``str()`` before parsing.

    Returns:
        A float when a number is found, otherwise ``value`` unchanged.
    """
    text = str(value).lower().strip()

    # A sign only counts when it is not glued to a preceding word/number,
    # so "item-5" still reads as 5 while "-5 km" reads as -5.
    number_match = re.search(
        r"((?:(?<![\w.])[-+])?(?:\d+(?:\.\d*)?|\.\d+))", text
    )
    if number_match is None:
        return value

    number = float(number_match.group(1))

    # Match the unit as a whole word rather than a substring, so that text
    # such as "3 comments" is not mistaken for millimetres.
    unit_match = re.search(r"[a-z]+", text[number_match.end():])
    unit = unit_match.group(0) if unit_match else None

    if unit == "km":
        return number * 1000
    if unit == "cm":
        return number / 100
    if unit == "mm":
        return number / 1000
    return number
243
+
244
+
245
def detect_unit_columns(df):
    """Convert text columns that look like unit-tagged lengths to numbers.

    A text column is treated as a unit column when its first non-missing
    value contains one of the markers ``km``, ``cm``, ``mm`` or `` m``.
    Such a column is passed value by value through ``convert_units``.  When
    every non-missing converted value is numeric, the column is stored with
    a numeric dtype so later steps do not label-encode it as categories.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A copy of ``df`` with the detected columns converted.
    """
    result = df.copy()
    unit_markers = ("km", "cm", "mm", " m")

    for col in result.columns:
        series = result[col]
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue

        # Sample the first real value; an empty or all-missing column has
        # nothing to inspect (and iloc[0] would raise on an empty frame).
        present = series.dropna()
        if present.empty:
            continue
        sample = str(present.iloc[0]).lower()
        if not any(marker in sample for marker in unit_markers):
            continue

        converted = series.apply(convert_units)
        as_numbers = pd.to_numeric(converted, errors="coerce")
        # Only switch to a numeric dtype when no value would be lost.
        if as_numbers.notna().sum() == converted.notna().sum():
            converted = as_numbers
        result[col] = converted

    return result
256
+
257
+
258
def detect_best_target(df):
    """Score every column as a candidate classification target.

    Scoring per column:
      * +6 when it has between 2 and 15 distinct values,
      * +3 when it holds text (object or pandas string dtype),
      * -10 when more than 90% of the rows are distinct (ID-like),
      * -5 when it has more than 50 distinct values.

    Args:
        df: DataFrame to inspect.

    Returns:
        ``(best, top5)`` where ``best`` is the highest-scoring column name
        (the earliest column wins ties) and ``top5`` is a list of up to five
        ``(column, score)`` pairs in descending score order.

    Raises:
        ValueError: If ``df`` has no columns.
    """
    if len(df.columns) == 0:
        raise ValueError("cannot detect a target column: the DataFrame has no columns")

    n_rows = len(df)
    scores = {}

    for col in df.columns:
        series = df[col]
        unique = series.nunique()
        # Guard against a header-only CSV (zero rows).
        ratio = unique / n_rows if n_rows else 0.0

        score = 0
        if 2 <= unique <= 15:
            score += 6
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            score += 3
        if ratio > 0.9:
            score -= 10
        if unique > 50:
            score -= 5

        scores[col] = score

    # sorted() is stable, so the first entry is also what max() would pick.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[0][0], ranked[:5]
284
+
285
+
286
def prepare_for_supervised(df, target):
    """Label-encode non-numeric columns and split features from the target.

    Columns with object, pandas string, or categorical dtype are cast to
    ``str`` and replaced by ``LabelEncoder`` integer codes.  Checking only
    for ``object`` would leave string-dtype or categorical columns
    unencoded, which the estimators used later cannot consume.

    Args:
        df: Cleaned input DataFrame; it is not modified.
        target: Name of the target column.

    Returns:
        ``(X, y, data)``: the feature frame (all columns except ``target``),
        the target series, and the fully encoded frame.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    data = df.copy()

    for col in data.columns:
        series = data[col]
        needs_encoding = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
            or isinstance(series.dtype, pd.CategoricalDtype)
        )
        if needs_encoding:
            # A fresh encoder per column: codes are only meaningful within
            # that column.
            data[col] = LabelEncoder().fit_transform(series.astype(str))

    X = data.drop(columns=[target])
    y = data[target]

    return X, y, data
298
+
299
+
300
+ # --- ACCURACY HELPER FUNCTIONS ---
301
+
302
def clip_outliers_iqr(df, exclude=()):
    """Clip numeric outliers to the IQR fences instead of dropping rows.

    For each numeric column, values below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR`` are clamped to the fence.  Columns whose IQR is zero
    (or undefined) are skipped: their fences coincide, so clipping would
    overwrite every differing value and turn the column into a constant.

    Args:
        df: Input DataFrame; it is not modified.
        exclude: Column name or iterable of column names that must never be
            clipped (for example a label column).  Defaults to none.

    Returns:
        ``(clipped, info)`` where ``clipped`` is the new DataFrame and
        ``info`` maps each clipped column to the number of values clamped.
    """
    if isinstance(exclude, str):
        exclude = (exclude,)
    protected = set(exclude)

    clipped = df.copy()
    info = {}

    for col in clipped.select_dtypes(include=[np.number]).columns:
        if col in protected:
            continue

        series = clipped[col]
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        # "not > 0" also covers a NaN IQR (e.g. an all-missing column).
        if not iqr > 0:
            continue

        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        n_out = int(((series < lower) | (series > upper)).sum())
        if n_out:
            clipped[col] = series.clip(lower=lower, upper=upper)
            info[col] = n_out

    return clipped, info
317
+
318
+
319
def remove_low_variance(X, threshold=0.01):
    """Drop numeric features whose variance is below ``threshold``.

    Variance is computed over numeric columns only, so a stray non-numeric
    column is kept untouched instead of making ``var()`` raise.

    Args:
        X: Feature DataFrame.
        threshold: Columns with variance strictly below this are removed.

    Returns:
        ``(X, removed)``: the reduced frame and the list of dropped column
        names (empty when nothing was dropped).
    """
    variances = X.var(numeric_only=True)
    removed = variances[variances < threshold].index.tolist()
    if removed:
        X = X.drop(columns=removed)
    return X, removed
326
+
327
+
328
def remove_high_correlation(X, threshold=0.95):
    """Drop the later column of each highly correlated feature pair.

    Absolute pairwise correlations are computed over numeric columns only.
    Looking at the strict upper triangle means each pair is checked once,
    and the column that appears later in ``X`` is the one removed.

    Args:
        X: Feature DataFrame.
        threshold: Absolute correlation above which a column is dropped.

    Returns:
        ``(X, removed)``: the reduced frame and the list of dropped column
        names (empty when nothing was dropped).
    """
    corr = X.corr(numeric_only=True).abs()
    upper_mask = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    upper = corr.where(upper_mask)

    removed = [col for col in upper.columns if (upper[col] > threshold).any()]
    if removed:
        X = X.drop(columns=removed)
    return X, removed
336
+
337
+
338
def balance_classes(X, y):
    """Oversample every minority class up to the majority class size.

    Nothing is changed when there are fewer than two classes or when the
    largest class is less than twice the size of the smallest.

    Returns:
        ``(X, y, balanced)`` where ``balanced`` tells whether rows were
        added.  Added rows keep the index labels of the rows they copy.
    """
    labels, frequencies = np.unique(y, return_counts=True)
    if len(labels) < 2:
        return X, y, False

    majority_size = frequencies.max()
    if majority_size / frequencies.min() < 2:
        return X, y, False

    feature_parts = [X.copy()]
    label_parts = [y.copy()]

    for label, size in zip(labels, frequencies):
        if size >= majority_size:
            continue
        shortfall = majority_size - size
        member_index = y[y == label].index
        # Draw with replacement from this class until it matches the majority.
        sampled = resample(
            X.loc[member_index],
            replace=True,
            n_samples=shortfall,
            random_state=42,
        )
        feature_parts.append(sampled)
        label_parts.append(pd.Series([label] * shortfall, index=sampled.index))

    return pd.concat(feature_parts), pd.concat(label_parts), True
361
+
362
+
363
def select_top_features(X, y, max_features=20):
    """Keep at most ``max_features`` columns, ranked by mutual information.

    When ``X`` already has no more than ``max_features`` columns it is
    returned unchanged together with the list of its column names.
    """
    if X.shape[1] > max_features:
        scores = pd.Series(
            mutual_info_classif(X, y, random_state=42),
            index=X.columns,
        )
        ranked = scores.sort_values(ascending=False)
        keep = ranked.head(max_features).index.tolist()
        return X[keep], keep

    return X, list(X.columns)
371
+
372
+
373
def preprocess_for_model(df, target):
    """Run the full preprocessing pipeline for a supervised model.

    Steps, in order: label-encode and split features/target, clip feature
    outliers (IQR), drop low-variance features, drop highly correlated
    features, oversample minority classes, keep the top features by mutual
    information.

    Args:
        df: Cleaned input DataFrame.
        target: Name of the target column.

    Returns:
        ``(X, y, transformed, info)`` where ``transformed`` is the encoded
        (unclipped) frame and ``info`` is a dict with the keys
        ``outliers_clipped``, ``low_var_removed``, ``high_corr_removed``,
        ``class_balanced`` and ``features_used``.
    """
    X, y, transformed = prepare_for_supervised(df, target)

    # Clip the features only.  The target holds class labels, not
    # measurements: clipping it can merge a rare class into another one
    # (with an imbalanced binary target both fences collapse onto the
    # majority label) or produce fractional labels.
    X, outlier_info = clip_outliers_iqr(X)

    # Remove low variance
    X, low_var = remove_low_variance(X)

    # Remove high correlation
    X, high_corr = remove_high_correlation(X)

    # Balance classes
    # NOTE(review): oversampling happens before callers do their train/test
    # split, so copies of one row can land on both sides and inflate the
    # reported scores.  Consider balancing the training split only.
    X, y, balanced = balance_classes(X, y)

    # Feature selection
    X, _selected = select_top_features(X, y)

    return X, y, transformed, {
        "outliers_clipped": outlier_info,
        "low_var_removed": low_var,
        "high_corr_removed": high_corr,
        "class_balanced": balanced,
        "features_used": list(X.columns),
    }
401
+
402
+
403
def show_confusion(y_true, y_pred, title):
    """Render a confusion-matrix heatmap in the centre column of the page.

    The plot is drawn on its own axes rather than through the implicit
    pyplot "current figure", and the figure is released from pyplot after
    rendering so that repeated runs do not accumulate open figures.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        title: Plot title.

    Returns:
        The matplotlib ``Figure`` that was rendered (already closed in
        pyplot's registry).
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    matrix = confusion_matrix(y_true, y_pred)

    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        linewidths=1,
        ax=ax,
    )

    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")

    # The 1:2:1 layout keeps the figure centred at half the page width.
    _left, centre, _right = st.columns([1, 2, 1])
    with centre:
        st.pyplot(fig)

    plt.close(fig)
    return fig
423
+
424
+
425
def compact_bar(labels, values, title):
    """Render a small bar chart in the centre column of the page.

    The plot is drawn on its own axes rather than through the implicit
    pyplot "current figure", and the figure is released from pyplot after
    rendering so that repeated runs do not accumulate open figures.

    Args:
        labels: Category labels for the x axis.
        values: Bar heights, aligned with ``labels``.
        title: Plot title.

    Returns:
        The matplotlib ``Figure`` that was rendered (already closed in
        pyplot's registry).
    """
    fig, ax = plt.subplots(figsize=(6, 3))

    sns.barplot(x=labels, y=values, ax=ax)

    ax.tick_params(axis="x", labelrotation=20)
    ax.set_title(title)

    # The 1:2:1 layout keeps the figure centred at half the page width.
    _left, centre, _right = st.columns([1, 2, 1])
    with centre:
        st.pyplot(fig)

    plt.close(fig)
    return fig
437
+
438
+
439
def save_result(name, score, target_col, features_used, extra_info=None):
    """Record one model run in the Streamlit session state.

    Updates ``last_model_name`` and ``last_score`` and appends an entry to
    both ``history`` and ``model_results``.  Each list receives its own
    copy of the entry, so editing an entry in one list cannot silently
    change the other.

    Args:
        name: Model name.
        score: Score of the run.
        target_col: Target column used for the run.
        features_used: Description of the features used.
        extra_info: Optional dict of additional fields; its keys are merged
            into the entry and override the core fields on collision.
    """
    st.session_state.last_model_name = name
    st.session_state.last_score = score

    entry = {
        "Model": name,
        "Score": score,
        "Target": target_col,
        "Features": features_used,
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    if extra_info:
        entry.update(extra_info)

    st.session_state.history.append(dict(entry))
    st.session_state.model_results.append(dict(entry))
457
+
458
+
459
+ # --- REPORT GENERATORS ---
460
+
461
def generate_text_report(df, target, model_results):
    """Build the plain-text report covering the dataset and all model runs.

    Args:
        df: DataFrame summarised in the dataset and column sections.
        target: Target column name.  When it is ``None`` or not a column of
            ``df``, the unique-value count is reported as ``N/A`` instead
            of raising ``KeyError``.
        model_results: List of result dicts.  Each needs ``Model`` and
            ``Score``; all other fields are optional and printed only when
            present.

    Returns:
        The report as a single newline-joined string.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    best = max(model_results, key=lambda x: x["Score"]) if model_results else None

    has_target = target is not None and target in df.columns
    target_unique = df[target].nunique() if has_target else "N/A"

    # Optional per-run fields, in print order: (result key, line template).
    optional_fields = (
        ("Precision", " Precision: {:.2f}%"),
        ("Recall", " Recall: {:.2f}%"),
        ("F1Score", " F1 Score: {:.2f}%"),
        ("BestParams", " Best Hyperparameters: {}"),
        ("OutliersClipped", " Outliers Clipped: {} columns"),
        ("LowVarRemoved", " Low Variance Features Removed: {}"),
        ("HighCorrRemoved", " High Correlation Features Removed: {}"),
        ("ClassBalanced", " Class Balancing Applied: {}"),
        ("BestK", " Optimal Clusters (k): {}"),
    )

    lines = [
        heavy_rule,
        " DARK AI AUTOML PLATFORM - FULL REPORT",
        heavy_rule,
        f" Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
        light_rule,
        " DATASET SUMMARY",
        light_rule,
        f" Rows: {df.shape[0]}",
        f" Columns: {df.shape[1]}",
        f" Target Column: {target}",
        f" Target Unique Values: {target_unique}",
        "",
        light_rule,
        " COLUMN DETAILS",
        light_rule,
    ]

    for col in df.columns:
        dtype = str(df[col].dtype)
        nunique = df[col].nunique()
        missing = df[col].isnull().sum()
        lines.append(f" {col}: type={dtype}, unique={nunique}, missing={missing}")
    lines.append("")

    lines.extend([light_rule, " MODEL RESULTS (ALL RUNS)", light_rule])
    for i, r in enumerate(model_results, 1):
        lines.extend([
            "",
            f" Run #{i}",
            f" Model: {r['Model']}",
            f" Accuracy/Score: {r['Score']:.2f}%",
            f" Target Feature: {r.get('Target', 'N/A')}",
            f" Features Used: {r.get('Features', 'N/A')}",
            f" Timestamp: {r.get('Timestamp', 'N/A')}",
        ])
        lines.extend(
            template.format(r[key])
            for key, template in optional_fields
            if key in r
        )

    if best:
        lines.extend([
            "",
            light_rule,
            " BEST MODEL",
            light_rule,
            f" Model: {best['Model']}",
            f" Score: {best['Score']:.2f}%",
            f" Target: {best.get('Target', 'N/A')}",
        ])

    lines.extend([
        "",
        light_rule,
        " PREPROCESSING PIPELINE",
        light_rule,
        " - Duplicate removal",
        " - Missing values handled (median for numeric, mode for categorical)",
        " - Unit conversion (km/cm/mm -> m)",
        " - Categorical encoding (LabelEncoder)",
        " - Outlier clipping (IQR method)",
        " - Low variance feature removal",
        " - High correlation feature removal",
        " - Class imbalance handling (oversampling)",
        " - Feature selection (mutual information, top 20)",
        " - Scaling where required (StandardScaler / RobustScaler)",
        " - Hyperparameter tuning (GridSearchCV)",
        " - Stratified cross-validation (5-fold)",
        "",
        heavy_rule,
        " END OF REPORT",
        heavy_rule,
    ])

    return "\n".join(lines)
551
+
552
+
553
def generate_xlsx_report(df, target, model_results):
    """Build the multi-sheet XLSX report as an in-memory buffer.

    Sheets written: "Dataset Summary", "Column Details", "Model Results"
    and, when there is at least one result, "Best Model" (the result with
    the highest ``Score``).

    Args:
        df: DataFrame summarised in the first two sheets.
        target: Target column name.  When it is ``None`` or not a column of
            ``df``, the unique-value count is written as ``N/A`` instead of
            raising ``KeyError``.
        model_results: List of result dicts, one row per run.

    Returns:
        An ``io.BytesIO`` positioned at the start of the workbook bytes.
    """
    has_target = target is not None and target in df.columns
    target_unique = df[target].nunique() if has_target else "N/A"

    output = io.BytesIO()

    with pd.ExcelWriter(output, engine="openpyxl") as writer:
        # Sheet 1: Dataset Summary
        summary = pd.DataFrame({
            "Property": ["Rows", "Columns", "Target Column", "Target Unique Values"],
            "Value": [df.shape[0], df.shape[1], target, target_unique],
        })
        summary.to_excel(writer, sheet_name="Dataset Summary", index=False)

        # Sheet 2: Column Details
        col_details = [
            {
                "Column": col,
                "Type": str(df[col].dtype),
                "Unique Values": df[col].nunique(),
                "Missing Values": df[col].isnull().sum(),
            }
            for col in df.columns
        ]
        pd.DataFrame(col_details).to_excel(writer, sheet_name="Column Details", index=False)

        # Sheet 3: Model Results
        pd.DataFrame(model_results).to_excel(writer, sheet_name="Model Results", index=False)

        # Sheet 4: Best Model
        if model_results:
            best = max(model_results, key=lambda x: x["Score"])
            pd.DataFrame([best]).to_excel(writer, sheet_name="Best Model", index=False)

    # Rewind so callers can read the workbook from the beginning.
    output.seek(0)
    return output
587
+
588
+
589
+ # ==========================================================
590
+ # UPLOAD
591
+ # ==========================================================
592
+ st.markdown('<div class="section">📁 Upload Dataset</div>', unsafe_allow_html=True)
593
+
594
+ file = st.file_uploader("Upload CSV File", type=["csv"])
595
+
596
+ # ==========================================================
597
+ # MAIN APP
598
+ # ==========================================================
599
+ if file:
600
+
601
+ raw = pd.read_csv(file)
602
+
603
+ st.markdown('<div class="section">📌 Dataset Preview</div>', unsafe_allow_html=True)
604
+ st.dataframe(raw.head(), use_container_width=True)
605
+
606
+ df = smart_clean(raw)
607
+ df = detect_unit_columns(df)
608
+
609
+ st.session_state.cleaned_df = df
610
+
611
+ # ------------------------------------------------------
612
+ # TARGET DETECTION
613
+ # ------------------------------------------------------
614
+ st.markdown('<div class="section">🎯 AI Target Detection</div>', unsafe_allow_html=True)
615
+
616
+ best_target, top5 = detect_best_target(df)
617
+
618
+ st.success(f"Recommended Target Column: {best_target}")
619
+
620
+ st.write("Top Suggestions:")
621
+
622
+ for n, s in top5:
623
+ st.write(f"• {n} (score: {s})")
624
+
625
+ # Dropdown with AI recommendation pre-selected, user can override
626
+ target = st.selectbox(
627
+ "Choose Target Column (AI recommended is pre-selected - change if needed)",
628
+ [best_target] + [c for c in df.columns if c != best_target]
629
+ )
630
+
631
+ st.session_state.selected_target = target
632
+
633
+ # ------------------------------------------------------
634
+ # MODEL SELECT
635
+ # ------------------------------------------------------
636
+ st.markdown('<div class="section">🤖 Choose Model</div>', unsafe_allow_html=True)
637
+
638
+ model_choice = st.selectbox(
639
+ "Select One Model",
640
+ [
641
+ "Random Forest",
642
+ "SVM",
643
+ "Logistic Regression",
644
+ "Decision Tree",
645
+ "KMeans Clustering"
646
+ ]
647
+ )
648
+
649
+ # ------------------------------------------------------
650
+ # APPLY MODEL
651
+ # ------------------------------------------------------
652
+ if st.button("🚀 Apply Model"):
653
+
654
+ # Each model result is in its own container so
655
+ # applying a second model shows results separately beneath the first
656
+
657
+ # RANDOM FOREST
658
+ if model_choice == "Random Forest":
659
+
660
+ X, y, transformed, pp_info = preprocess_for_model(df, target)
661
+ features_used = pp_info["features_used"]
662
+
663
+ result_box = st.container()
664
+ with result_box:
665
+ st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
666
+ st.markdown(f"### Random Forest Results (Target: {target})")
667
+
668
+ col1, col2 = st.columns(2)
669
+
670
+ with col1:
671
+ st.write("Original")
672
+ st.dataframe(raw.head())
673
+
674
+ with col2:
675
+ st.write("Processed")
676
+ st.dataframe(transformed.head())
677
+
678
+ X_train, X_test, y_train, y_test = train_test_split(
679
+ X, y, test_size=0.2, random_state=42, stratify=y
680
+ )
681
+
682
+ cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
683
+
684
+ model = GridSearchCV(
685
+ RandomForestClassifier(),
686
+ {
687
+ "n_estimators":[100,200,300],
688
+ "max_depth":[5,10,15,None],
689
+ "min_samples_split":[2,5],
690
+ "min_samples_leaf":[1,2]
691
+ },
692
+ cv=cv,
693
+ n_jobs=-1
694
+ )
695
+
696
+ model.fit(X_train, y_train)
697
+
698
+ pred = model.predict(X_test)
699
+
700
+ acc = accuracy_score(y_test, pred)*100
701
+ prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
702
+ rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
703
+ f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
704
+
705
+ st.success(f"Accuracy: {acc:.2f}%")
706
+ st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
707
+
708
+ show_confusion(y_test, pred, "Random Forest Matrix")
709
+
710
+ imp = pd.Series(
711
+ model.best_estimator_.feature_importances_,
712
+ index=X.columns
713
+ ).sort_values(ascending=False).head(8)
714
+
715
+ compact_bar(imp.index, imp.values, "Feature Importance")
716
+
717
+ st.write("**Classification Report:**")
718
+ st.text(classification_report(y_test, pred, zero_division=0))
719
+
720
+ st.markdown('</div>', unsafe_allow_html=True)
721
+
722
+ joblib.dump(model.best_estimator_, "random_forest.pkl")
723
+
724
+ save_result("Random Forest", acc, target, ", ".join(features_used), {
725
+ "Precision": prec,
726
+ "Recall": rec,
727
+ "F1Score": f1,
728
+ "BestParams": str(model.best_params_),
729
+ "OutliersClipped": len(pp_info["outliers_clipped"]),
730
+ "LowVarRemoved": str(pp_info["low_var_removed"]),
731
+ "HighCorrRemoved": str(pp_info["high_corr_removed"]),
732
+ "ClassBalanced": pp_info["class_balanced"],
733
+ })
734
+
735
+ # SVM
736
+ elif model_choice == "SVM":
737
+
738
+ X, y, transformed, pp_info = preprocess_for_model(df, target)
739
+ features_used = pp_info["features_used"]
740
+
741
+ result_box = st.container()
742
+ with result_box:
743
+ st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
744
+ st.markdown(f"### SVM Results (Target: {target})")
745
+
746
+ X_train, X_test, y_train, y_test = train_test_split(
747
+ X, y, test_size=0.2, random_state=42, stratify=y
748
+ )
749
+
750
+ # RobustScaler for SVM (handles outliers better)
751
+ sc = RobustScaler()
752
+
753
+ X_train = sc.fit_transform(X_train)
754
+ X_test = sc.transform(X_test)
755
+
756
+ cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
757
+
758
+ model = GridSearchCV(
759
+ SVC(),
760
+ {
761
+ "C":[0.1,1,10,100],
762
+ "kernel":["rbf","linear","poly"],
763
+ "gamma":["scale","auto"]
764
+ },
765
+ cv=cv,
766
+ n_jobs=-1
767
+ )
768
+
769
+ model.fit(X_train, y_train)
770
+
771
+ pred = model.predict(X_test)
772
+
773
+ acc = accuracy_score(y_test, pred)*100
774
+ prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
775
+ rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
776
+ f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
777
+
778
+ st.success(f"Accuracy: {acc:.2f}%")
779
+ st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
780
+
781
+ show_confusion(y_test, pred, "SVM Matrix")
782
+
783
+ st.write("**Classification Report:**")
784
+ st.text(classification_report(y_test, pred, zero_division=0))
785
+
786
+ st.markdown('</div>', unsafe_allow_html=True)
787
+
788
+ joblib.dump(model.best_estimator_, "svm.pkl")
789
+
790
+ save_result("SVM", acc, target, ", ".join(features_used), {
791
+ "Precision": prec,
792
+ "Recall": rec,
793
+ "F1Score": f1,
794
+ "BestParams": str(model.best_params_),
795
+ "OutliersClipped": len(pp_info["outliers_clipped"]),
796
+ "LowVarRemoved": str(pp_info["low_var_removed"]),
797
+ "HighCorrRemoved": str(pp_info["high_corr_removed"]),
798
+ "ClassBalanced": pp_info["class_balanced"],
799
+ })
800
+
801
+ # LOGISTIC
802
+ elif model_choice == "Logistic Regression":
803
+
804
+ X, y, transformed, pp_info = preprocess_for_model(df, target)
805
+ features_used = pp_info["features_used"]
806
+
807
+ result_box = st.container()
808
+ with result_box:
809
+ st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
810
+ st.markdown(f"### Logistic Regression Results (Target: {target})")
811
+
812
+ X_train, X_test, y_train, y_test = train_test_split(
813
+ X, y, test_size=0.2, random_state=42, stratify=y
814
+ )
815
+
816
+ sc = StandardScaler()
817
+
818
+ X_train = sc.fit_transform(X_train)
819
+ X_test = sc.transform(X_test)
820
+
821
+ cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
822
+
823
+ model = GridSearchCV(
824
+ LogisticRegression(max_iter=5000, solver="liblinear"),
825
+ {
826
+ "C":[0.01,0.1,1,10,100],
827
+ "penalty":["l1","l2"]
828
+ },
829
+ cv=cv,
830
+ n_jobs=-1
831
+ )
832
+
833
+ model.fit(X_train, y_train)
834
+
835
+ pred = model.predict(X_test)
836
+
837
+ acc = accuracy_score(y_test, pred)*100
838
+ prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
839
+ rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
840
+ f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
841
+
842
+ st.success(f"Accuracy: {acc:.2f}%")
843
+ st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
844
+
845
+ show_confusion(y_test, pred, "Logistic Regression Matrix")
846
+
847
+ # Show coefficient magnitudes for logistic regression
848
+ if hasattr(model.best_estimator_, "coef_"):
849
+ coef = pd.Series(
850
+ np.abs(model.best_estimator_.coef_[0]),
851
+ index=X.columns
852
+ ).sort_values(ascending=False).head(8)
853
+ compact_bar(coef.index, coef.values, "Feature Coefficients (Absolute)")
854
+
855
+ st.write("**Classification Report:**")
856
+ st.text(classification_report(y_test, pred, zero_division=0))
857
+
858
+ st.markdown('</div>', unsafe_allow_html=True)
859
+
860
+ joblib.dump(model.best_estimator_, "logistic.pkl")
861
+
862
+ save_result("Logistic Regression", acc, target, ", ".join(features_used), {
863
+ "Precision": prec,
864
+ "Recall": rec,
865
+ "F1Score": f1,
866
+ "BestParams": str(model.best_params_),
867
+ "OutliersClipped": len(pp_info["outliers_clipped"]),
868
+ "LowVarRemoved": str(pp_info["low_var_removed"]),
869
+ "HighCorrRemoved": str(pp_info["high_corr_removed"]),
870
+ "ClassBalanced": pp_info["class_balanced"],
871
+ })
872
+
873
+ # DECISION TREE
874
+ elif model_choice == "Decision Tree":
875
+
876
+ X, y, transformed, pp_info = preprocess_for_model(df, target)
877
+ features_used = pp_info["features_used"]
878
+
879
+ result_box = st.container()
880
+ with result_box:
881
+ st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
882
+ st.markdown(f"### Decision Tree Results (Target: {target})")
883
+
884
+ X_train, X_test, y_train, y_test = train_test_split(
885
+ X, y, test_size=0.2, random_state=42, stratify=y
886
+ )
887
+
888
+ cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
889
+
890
+ model = GridSearchCV(
891
+ DecisionTreeClassifier(),
892
+ {
893
+ "max_depth":[3,5,10,15,None],
894
+ "min_samples_split":[2,5,10],
895
+ "min_samples_leaf":[1,2,4],
896
+ "criterion":["gini","entropy"]
897
+ },
898
+ cv=cv,
899
+ n_jobs=-1
900
+ )
901
+
902
+ model.fit(X_train, y_train)
903
+
904
+ pred = model.predict(X_test)
905
+
906
+ acc = accuracy_score(y_test, pred)*100
907
+ prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
908
+ rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
909
+ f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
910
+
911
+ st.success(f"Accuracy: {acc:.2f}%")
912
+ st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
913
+
914
+ show_confusion(y_test, pred, "Decision Tree Matrix")
915
+
916
+ # Feature importance for decision tree
917
+ imp = pd.Series(
918
+ model.best_estimator_.feature_importances_,
919
+ index=X.columns
920
+ ).sort_values(ascending=False).head(8)
921
+ compact_bar(imp.index, imp.values, "Feature Importance")
922
+
923
+ st.write("**Classification Report:**")
924
+ st.text(classification_report(y_test, pred, zero_division=0))
925
+
926
+ st.markdown('</div>', unsafe_allow_html=True)
927
+
928
+ joblib.dump(model.best_estimator_, "decision_tree.pkl")
929
+
930
+ save_result("Decision Tree", acc, target, ", ".join(features_used), {
931
+ "Precision": prec,
932
+ "Recall": rec,
933
+ "F1Score": f1,
934
+ "BestParams": str(model.best_params_),
935
+ "OutliersClipped": len(pp_info["outliers_clipped"]),
936
+ "LowVarRemoved": str(pp_info["low_var_removed"]),
937
+ "HighCorrRemoved": str(pp_info["high_corr_removed"]),
938
+ "ClassBalanced": pp_info["class_balanced"],
939
+ })
940
+
941
+ # KMEANS
942
+ elif model_choice == "KMeans Clustering":
943
+
944
+ temp = df.copy()
945
+
946
+ for col in temp.columns:
947
+ if temp[col].dtype == "object":
948
+ le = LabelEncoder()
949
+ temp[col] = le.fit_transform(temp[col].astype(str))
950
+
951
+ X = temp.drop(columns=[target])
952
+
953
+ # Clip outliers for clustering too
954
+ temp_clipped, outlier_info = clip_outliers_iqr(temp)
955
+ X_clipped = temp_clipped.drop(columns=[target])
956
+
957
+ sc = StandardScaler()
958
+ Xs = sc.fit_transform(X_clipped)
959
+
960
+ # Find optimal k using elbow method
961
+ inertias = []
962
+ K_range = range(2, min(11, len(df) // 10 + 1))
963
+ for k in K_range:
964
+ km = KMeans(n_clusters=k, random_state=42, n_init=10)
965
+ km.fit(Xs)
966
+ inertias.append(km.inertia_)
967
+
968
+ best_k = 3
969
+ if len(inertias) >= 3:
970
+ diffs = [inertias[i] - inertias[i+1] for i in range(len(inertias)-1)]
971
+ if diffs:
972
+ elbow_idx = np.argmax(diffs) + 1
973
+ best_k = list(K_range)[elbow_idx] if elbow_idx < len(list(K_range)) else 3
974
+ best_k = max(2, min(best_k, 10))
975
+
976
+ result_box = st.container()
977
+ with result_box:
978
+ st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
979
+ st.markdown(f"### KMeans Clustering Results (Target: {target})")
980
+
981
+ model = KMeans(n_clusters=best_k, random_state=42, n_init=10)
982
+
983
+ cluster = model.fit_predict(Xs)
984
+
985
+ score = silhouette_score(Xs, cluster)*100
986
+
987
+ st.success(f"Cluster Quality Score: {score:.2f}% (k={best_k})")
988
+
989
+ fig, ax = plt.subplots(figsize=(6,4))
990
+ plt.scatter(Xs[:,0], Xs[:,1], c=cluster, cmap="viridis")
991
+ plt.title(f"Clusters (k={best_k})")
992
+ col1, col2, col3 = st.columns([1, 2, 1])
993
+ with col2:
994
+ st.pyplot(fig)
995
+
996
+ # Elbow plot
997
+ fig2, ax2 = plt.subplots(figsize=(6,3))
998
+ plt.plot(list(K_range), inertias, "bo-")
999
+ plt.xlabel("Number of Clusters (k)")
1000
+ plt.ylabel("Inertia")
1001
+ plt.title("Elbow Method")
1002
+ col1, col2, col3 = st.columns([1, 2, 1])
1003
+ with col2:
1004
+ st.pyplot(fig2)
1005
+
1006
+ # Cluster distribution
1007
+ cluster_counts = pd.Series(cluster).value_counts().sort_index()
1008
+ fig3, ax3 = plt.subplots(figsize=(6,3))
1009
+ sns.barplot(x=cluster_counts.index, y=cluster_counts.values)
1010
+ plt.xlabel("Cluster")
1011
+ plt.ylabel("Count")
1012
+ plt.title("Cluster Distribution")
1013
+ col1, col2, col3 = st.columns([1, 2, 1])
1014
+ with col2:
1015
+ st.pyplot(fig3)
1016
+
1017
+ st.markdown('</div>', unsafe_allow_html=True)
1018
+
1019
+ joblib.dump(model, "kmeans.pkl")
1020
+
1021
+ save_result("KMeans Clustering", score, target, ", ".join(X_clipped.columns), {
1022
+ "BestK": best_k,
1023
+ "OutliersClipped": len(outlier_info),
1024
+ })
1025
+
1026
+ # ==========================================================
1027
+ # DOWNLOAD SECTION
1028
+ # ==========================================================
1029
if st.session_state.last_model_name:
    # Offer the pickle written by the most recently trained model, if any.
    st.markdown('<div class="section">⬇ Downloads</div>', unsafe_allow_html=True)

    # Display name of each model -> file name its branch dumps with joblib.
    file_map = {
        "Random Forest": "random_forest.pkl",
        "SVM": "svm.pkl",
        "Logistic Regression": "logistic.pkl",
        "Decision Tree": "decision_tree.pkl",
        "KMeans Clustering": "kmeans.pkl",
    }

    # .get() instead of indexing: a model name without an entry here simply
    # gets no download button rather than crashing the page with a KeyError.
    current = file_map.get(st.session_state.last_model_name)

    if current is not None and os.path.exists(current):
        with open(current, "rb") as f:
            st.download_button(
                label=f"Download {st.session_state.last_model_name} (Deploy Ready)",
                data=f,
                file_name=current,
                mime="application/octet-stream",
            )
1052
+
1053
+ # ==========================================================
1054
+ # HISTORY + REPORTS
1055
+ # ==========================================================
1056
if st.session_state.history:
    # Show every recorded run as a table and bar chart, then offer the
    # results as CSV, TXT and XLSX downloads.
    st.markdown('<div class="section">📊 History</div>', unsafe_allow_html=True)

    hist = pd.DataFrame(st.session_state.history)

    st.dataframe(hist, use_container_width=True)

    fig, ax = plt.subplots(figsize=(6, 3))
    sns.barplot(data=hist, x="Model", y="Score", ax=ax)
    ax.tick_params(axis="x", labelrotation=20)
    ax.set_title("All Applied Models")
    col1, col2, col3 = st.columns([1, 2, 1])
    with col2:
        st.pyplot(fig)
    # Release the figure; pyplot keeps it registered until it is closed, so
    # unclosed figures accumulate across script reruns.
    plt.close(fig)

    # CSV
    csv_buffer = io.StringIO()
    hist.to_csv(csv_buffer, index=False)

    st.download_button(
        "Download Results CSV",
        csv_buffer.getvalue(),
        "results.csv",
    )

    # The full reports need both the cleaned dataset and at least one
    # recorded model result.
    if st.session_state.cleaned_df is not None and st.session_state.model_results:
        report_target = st.session_state.selected_target or "unknown"

        # TXT report
        report_text = generate_text_report(
            st.session_state.cleaned_df,
            report_target,
            st.session_state.model_results,
        )

        st.download_button(
            "Download Full Report (TXT)",
            report_text,
            "full_report.txt",
            mime="text/plain",
        )

        # XLSX report: still best-effort (a failure must not take down the
        # page), but the reason is shown instead of being silently dropped.
        try:
            xlsx_data = generate_xlsx_report(
                st.session_state.cleaned_df,
                report_target,
                st.session_state.model_results,
            )
        except Exception as exc:
            st.warning(f"Could not generate the XLSX report: {exc}")
        else:
            st.download_button(
                "Download Full Report (XLSX)",
                data=xlsx_data.getvalue(),
                file_name="full_report.xlsx",
                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            )
1112
+
1113
+ # ==========================================================
1114
+ # RESET
1115
+ # ==========================================================
1116
st.markdown('<div class="section">♻ Reset</div>', unsafe_allow_html=True)

if st.button("Clear History"):
    # Session keys and the blank value each one is reset to, in the order
    # they are cleared. The tuple is rebuilt on every click, so `history`
    # and `model_results` each receive a fresh empty list.
    blank_state = (
        ("history", []),
        ("last_model_name", None),
        ("last_score", None),
        ("model_results", []),
        ("selected_target", None),
        ("cleaned_df", None),
    )
    for state_key, blank_value in blank_state:
        st.session_state[state_key] = blank_value

    st.success("History Cleared")
random_forest.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aa8be408ccf6fb6ec8b9937082a4d9db1b9129c3d2b1c462377ba172ae805b2
3
+ size 2105289
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+
3
+ streamlit
4
+ pandas
5
+ numpy
6
+ matplotlib
7
+ seaborn
8
+ scikit-learn
9
+ joblib
10
+ python-docx
11
+ python-pptx
12
+ openpyxl