Asma-Abid
/

Random-Forest

+import streamlit as st
+import pandas as pd
+import numpy as np
+import re
+import io
+import os
+import joblib
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import seaborn as sns
+from datetime import datetime
+from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
+from sklearn.preprocessing import LabelEncoder, StandardScaler, RobustScaler
+from sklearn.metrics import (
+    accuracy_score, confusion_matrix, silhouette_score,
+    classification_report, f1_score, precision_score, recall_score
+)
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.cluster import KMeans
+from sklearn.feature_selection import mutual_info_classif
+from sklearn.utils import resample
+# ==========================================================
+# PAGE CONFIG
+# ==========================================================
# Browser-tab title, icon and wide layout for the whole app.
st.set_page_config(page_title="AI AutoML Platform", page_icon="🤖", layout="wide")
# ==========================================================
# SESSION STATE
# ==========================================================
# Seed every session key the app uses. Only keys that are missing are
# written, so values already present in the session are left alone.
for _key, _initial in (
    ("history", []),
    ("last_model_name", None),
    ("last_score", None),
    # detailed results per model run, consumed by the report generators
    ("model_results", []),
    # selected target column, so the report can reference it
    ("selected_target", None),
    # cleaned DataFrame kept for report generation
    ("cleaned_df", None),
):
    if _key not in st.session_state:
        st.session_state[_key] = _initial
+# ==========================================================
+# THEME CSS
+# ==========================================================
# Dark-theme stylesheet for the app, injected as raw HTML in a single call.
_THEME_CSS = """
<style>
.stApp {
    background: linear-gradient(135deg,#0f172a,#111827,#020617);
    color: white;
}
.big-title {
    font-size: 42px;
    font-weight: 800;
    color: #38bdf8;
    text-align:center;
    padding:15px;
}
.sub-title {
    text-align:center;
    color:#cbd5e1;
    font-size:18px;
    margin-bottom:25px;
}
.section {
    background:#0f172a;
    padding:12px;
    border-radius:12px;
    color:#38bdf8;
    font-weight:700;
    font-size:24px;
    margin-top:20px;
}
.stButton>button {
    background:#38bdf8;
    color:black;
    border:none;
    border-radius:10px;
    font-weight:700;
}
.stButton>button:hover {
    background:#0ea5e9;
    color:white;
}
div[data-baseweb="select"] > div {
    background:#1e293b !important;
    color:white !important;
}
.model-result-box {
    background:#1e293b;
    padding:20px;
    border-radius:12px;
    border:2px solid #38bdf8;
    margin:15px 0;
}
/* File Uploader Button */
.stFileUploader>div>div>button {
    background:#38bdf8 !important;
    color:black !important;
    border:none !important;
    border-radius:10px !important;
    font-weight:700 !important;
}
.stFileUploader>div>div>button:hover {
    background:#0ea5e9 !important;
    color:white !important;
}
/* File Uploader Button Alternative Selectors */
.stFileUploader button {
    background:#38bdf8 !important;
    color:black !important;
    border:none !important;
    border-radius:10px !important;
    font-weight:700 !important;
}
.stFileUploader button:hover {
    background:#0ea5e9 !important;
    color:white !important;
}
/* Download Buttons */
.stDownloadButton>button {
    background:#38bdf8 !important;
    color:black !important;
    border:none !important;
    border-radius:10px !important;
    font-weight:700 !important;
}
.stDownloadButton>button:hover {
    background:#0ea5e9 !important;
    color:white !important;
}
/* File Uploader Label */
.stFileUploader label {
    color:#38bdf8 !important;
    font-size:16px !important;
    font-weight:700 !important;
}
/* Selectbox Labels */
.stSelectbox label {
    color:#38bdf8 !important;
    font-size:16px !important;
    font-weight:700 !important;
}
/* Text and Write Styling */
p {
    color:#cbd5e1;
}
.stWrite {
    color:#cbd5e1;
}
/* Center pyplot figures and add lateral padding */
.stPlotlyChart, .stPyplot {
    display: flex;
    justify-content: center;
}
.stPyplot {
    padding: 0 50px;
}
.stPlotlyChart {
    padding: 0 50px;
}
/* Centered containers */
.stContainer {
    max-width: 95%;
    margin-left: auto;
    margin-right: auto;
}
/* Classification Report Text */
.stText {
    color: white !important;
}
.stText pre {
    color: white !important;
}
.stText * {
    color: white !important;
}
</style>
"""
st.markdown(_THEME_CSS, unsafe_allow_html=True)
# ==========================================================
# HEADER
# ==========================================================
# Title and tagline, styled by the .big-title / .sub-title rules above.
for _header_html in (
    '<div class="big-title">🤖 AI AutoML Platform</div>',
    '<div class="sub-title">upload csv select model download trained model</div>',
):
    st.markdown(_header_html, unsafe_allow_html=True)
+# ==========================================================
+# HELPERS
+# ==========================================================
def smart_clean(df):
    """Return a de-duplicated copy of ``df`` with missing values filled in.

    Text columns (object or string dtype) are filled with their most frequent
    value. Every other column is filled with its median, which is more robust
    to outliers than the mean. A text column without any non-missing value has
    no mode, so it is left as-is instead of raising.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame with duplicate rows removed and gaps filled wherever a
        fill value could be computed.
    """
    cleaned = df.drop_duplicates().copy()
    for col in cleaned.columns:
        series = cleaned[col]
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if is_text:
            modes = series.mode()
            if modes.empty:
                # All values are missing: there is nothing to fill with.
                continue
            cleaned[col] = series.fillna(modes.iloc[0])
        else:
            cleaned[col] = series.fillna(series.median())
    return cleaned
def convert_units(value):
    """Convert a length written as text (e.g. ``"5 km"``) to a number in metres.

    The first number found in the text is used. The unit is detected by
    substring: ``km`` multiplies by 1000, ``cm`` divides by 100, ``mm``
    divides by 1000; anything else (including plain ``m`` or no unit) returns
    the number unchanged.

    Args:
        value: Any object; it is stringified before parsing.

    Returns:
        The converted ``float``, or ``value`` itself when no number is found
        or the number cannot be parsed (e.g. ``"1.2.3"``).
    """
    try:
        txt = str(value).lower().strip()
        # A leading sign is honoured only when it does not follow a word
        # character: "-5 km" is negative, while "route-66" still reads as 66.
        nums = re.findall(r'(?:(?<![\w.])[-+])?[\d.]+', txt)
        if not nums:
            return value
        num = float(nums[0])
    except (TypeError, ValueError):
        # Best effort: leave values that cannot be parsed untouched.
        return value
    if "km" in txt:
        return num * 1000
    if "cm" in txt:
        return num / 100
    if "mm" in txt:
        return num / 1000
    return num
def detect_unit_columns(df):
    """Convert text columns that look like lengths with units into numbers.

    For every text column the first non-missing value is inspected; if it
    contains one of the markers ``km``, ``cm``, ``mm`` or ``" m"``, the whole
    column is passed value-by-value through ``convert_units``. Columns with no
    non-missing value (including every column of an empty frame) are skipped.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A copy of ``df`` with the detected columns converted.
    """
    converted = df.copy()
    unit_markers = ("km", "cm", "mm", " m")
    for col in converted.columns:
        series = converted[col]
        is_text = (
            pd.api.types.is_object_dtype(series)
            or pd.api.types.is_string_dtype(series)
        )
        if not is_text:
            continue
        present = series.dropna()
        if present.empty:
            # Nothing to sample, so there is no evidence of a unit.
            continue
        sample = str(present.iloc[0]).lower()
        if any(marker in sample for marker in unit_markers):
            converted[col] = series.apply(convert_units)
    return converted
def detect_best_target(df):
    """Score every column as a candidate classification target.

    Heuristic per column:
      * +6 when it has between 2 and 15 distinct values,
      * +3 when it is a text (object/string) column,
      * -10 when more than 90% of the rows are distinct (ID-like),
      * -5 when it has more than 50 distinct values.

    Args:
        df: DataFrame to inspect. A frame with zero rows is accepted; the
            distinct-value ratio is then taken as 0.

    Returns:
        ``(best, top5)`` where ``best`` is the highest-scoring column name
        (first one wins on ties) and ``top5`` is a list of up to five
        ``(column, score)`` pairs in descending score order.

    Raises:
        ValueError: If ``df`` has no columns.
    """
    if df.shape[1] == 0:
        raise ValueError("detect_best_target requires at least one column")
    n_rows = len(df)
    scores = {}
    for col in df.columns:
        column = df[col]
        unique = column.nunique()
        # Guard the ratio so a zero-row frame does not divide by zero.
        ratio = unique / n_rows if n_rows else 0.0
        score = 0
        if 2 <= unique <= 15:
            score += 6
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            score += 3
        if ratio > 0.9:
            score -= 10
        if unique > 50:
            score -= 5
        scores[col] = score
    # sorted() is stable, so on ties the earliest column stays first.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    best = ranked[0][0]
    return best, ranked[:5]
def prepare_for_supervised(df, target):
    """Label-encode object columns and split the frame into features/target.

    Every object-dtype column (the target included) is stringified and
    replaced by integer codes from a fresh ``LabelEncoder``; the encoders are
    not kept.

    Args:
        df: Source DataFrame; it is not modified.
        target: Name of the column to predict.

    Returns:
        ``(X, y, data)``: the encoded frame without ``target``, the encoded
        ``target`` column, and the full encoded frame.
    """
    encoded = df.copy()
    text_columns = [
        name for name in encoded.columns if encoded[name].dtype == "object"
    ]
    for name in text_columns:
        encoded[name] = LabelEncoder().fit_transform(encoded[name].astype(str))
    features = encoded.drop(columns=[target])
    labels = encoded[target]
    return features, labels, encoded
+# --- ACCURACY HELPER FUNCTIONS ---
def clip_outliers_iqr(df):
    """Clip numeric outliers to the IQR fences instead of removing rows.

    For each numeric column, values below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR`` are clipped to that fence. Columns whose IQR is zero (or
    undefined) are left untouched: both fences would equal Q1, so clipping
    would flatten the column to a constant, e.g. a sparse 0/1 flag.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        ``(clipped, info)``: the clipped copy and a dict mapping each changed
        column to the number of values that were outside the fences.
    """
    clipped = df.copy()
    info = {}
    for col in clipped.select_dtypes(include=[np.number]).columns:
        q1 = clipped[col].quantile(0.25)
        q3 = clipped[col].quantile(0.75)
        iqr = q3 - q1
        if not iqr > 0:
            # Also true for NaN, i.e. a column with no usable values.
            continue
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        n_out = int(((clipped[col] < lower) | (clipped[col] > upper)).sum())
        if n_out > 0:
            clipped[col] = clipped[col].clip(lower=lower, upper=upper)
            info[col] = n_out
    return clipped, info
def remove_low_variance(X, threshold=0.01):
    """Drop columns whose variance is below ``threshold``.

    Args:
        X: Feature DataFrame.
        threshold: Variance below which a column is dropped.

    Returns:
        ``(X, dropped)``: the filtered frame (the input object itself when
        nothing is dropped) and the list of dropped column names.
    """
    column_variance = X.var()
    dropped = column_variance.index[column_variance < threshold].tolist()
    if not dropped:
        return X, dropped
    return X.drop(columns=dropped), dropped
def remove_high_correlation(X, threshold=0.95):
    """Drop the later column of every highly correlated pair.

    Only the strict upper triangle of the absolute correlation matrix is
    examined, so a column is dropped when it correlates above ``threshold``
    with any column that precedes it.

    Args:
        X: Feature DataFrame.
        threshold: Absolute correlation above which a column is dropped.

    Returns:
        ``(X, dropped)``: the filtered frame (the input object itself when
        nothing is dropped) and the list of dropped column names.
    """
    abs_corr = X.corr().abs()
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    pairwise = abs_corr.where(above_diagonal)
    redundant = []
    for name in pairwise.columns:
        if (pairwise[name] > threshold).any():
            redundant.append(name)
    if not redundant:
        return X, redundant
    return X.drop(columns=redundant), redundant
def balance_classes(X, y):
    """Oversample smaller classes until each matches the largest class.

    Nothing is done when there are fewer than two classes or when the largest
    class is less than twice the size of the smallest. Otherwise rows of every
    smaller class are drawn with replacement (fixed seed 42) and appended, so
    the returned index contains repeated labels.

    Args:
        X: Feature DataFrame sharing its index with ``y``.
        y: Target Series.

    Returns:
        ``(X, y, balanced)`` where ``balanced`` tells whether rows were added;
        when it is ``False`` the inputs are returned unchanged.
    """
    labels, sizes = np.unique(y, return_counts=True)
    if len(labels) < 2:
        return X, y, False
    largest = sizes.max()
    if largest / sizes.min() < 2:
        return X, y, False
    X_balanced = X.copy()
    y_balanced = y.copy()
    for label, size in zip(labels, sizes):
        if not size < largest:
            continue
        members = y[y == label].index
        drawn = resample(
            X.loc[members],
            replace=True,
            n_samples=largest - size,
            random_state=42,
        )
        drawn_labels = pd.Series([label] * (largest - size), index=drawn.index)
        X_balanced = pd.concat([X_balanced, drawn])
        y_balanced = pd.concat([y_balanced, drawn_labels])
    return X_balanced, y_balanced, True
def select_top_features(X, y, max_features=20):
    """Keep the ``max_features`` columns with the highest mutual information.

    Args:
        X: Feature DataFrame.
        y: Target values.
        max_features: Maximum number of columns to keep.

    Returns:
        ``(X, names)``: the input frame and all its column names when it
        already has at most ``max_features`` columns; otherwise the selected
        sub-frame and the selected names, best first.
    """
    if X.shape[1] <= max_features:
        return X, list(X.columns)
    relevance = pd.Series(
        mutual_info_classif(X, y, random_state=42),
        index=X.columns,
    )
    ranked = relevance.sort_values(ascending=False)
    chosen = ranked.head(max_features).index.tolist()
    return X[chosen], chosen
def preprocess_for_model(df, target):
    """Run the full preprocessing pipeline used before training a classifier.

    Steps, in order: label-encode text columns, clip feature outliers (IQR),
    drop low-variance features, drop highly correlated features, oversample
    minority classes, and keep the top features by mutual information.

    Outlier clipping is applied to the features only. The encoded target holds
    class codes, not measurements; clipping it could turn rare class labels
    into fractional values.

    NOTE(review): class balancing happens here, before any train/test split a
    caller performs, so oversampled copies of a row can land on both sides of
    that split. Confirm whether that is acceptable for the reported scores.

    Args:
        df: Cleaned source DataFrame.
        target: Name of the column to predict.

    Returns:
        ``(X, y, transformed, info)`` where ``transformed`` is the encoded
        (unclipped) frame and ``info`` has the keys ``outliers_clipped``,
        ``low_var_removed``, ``high_corr_removed``, ``class_balanced`` and
        ``features_used``.
    """
    X, y, transformed = prepare_for_supervised(df, target)
    # Clip outliers (features only; the target is left exactly as encoded)
    X, outlier_info = clip_outliers_iqr(X)
    # Remove low variance
    X, low_var = remove_low_variance(X)
    # Remove high correlation
    X, high_corr = remove_high_correlation(X)
    # Balance classes
    X, y, balanced = balance_classes(X, y)
    # Feature selection
    X, _ = select_top_features(X, y)
    return X, y, transformed, {
        "outliers_clipped": outlier_info,
        "low_var_removed": low_var,
        "high_corr_removed": high_corr,
        "class_balanced": balanced,
        "features_used": list(X.columns),
    }
def show_confusion(y_true, y_pred, title):
    """Render a confusion-matrix heatmap in the centre column of the page.

    Drawing goes through the figure's own axes rather than pyplot's implicit
    "current axes". The figure is closed once rendered so pyplot does not
    keep one open figure per call.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        title: Plot title.

    Returns:
        The matplotlib ``Figure`` (already closed in pyplot's registry).
    """
    fig, ax = plt.subplots(figsize=(5, 4))
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        linewidths=1,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    # Three columns with the plot in the wider middle one centres the figure.
    _, middle, _ = st.columns([1, 2, 1])
    with middle:
        st.pyplot(fig)
    plt.close(fig)
    return fig
def compact_bar(labels, values, title):
    """Render a small bar chart in the centre column of the page.

    Drawing goes through the figure's own axes rather than pyplot's implicit
    "current axes". The figure is closed once rendered so pyplot does not
    keep one open figure per call.

    Args:
        labels: Category labels for the x axis.
        values: Bar heights, aligned with ``labels``.
        title: Plot title.

    Returns:
        The matplotlib ``Figure`` (already closed in pyplot's registry).
    """
    fig, ax = plt.subplots(figsize=(6, 3))
    sns.barplot(x=labels, y=values, ax=ax)
    ax.tick_params(axis="x", labelrotation=20)
    ax.set_title(title)
    # Three columns with the plot in the wider middle one centres the figure.
    _, middle, _ = st.columns([1, 2, 1])
    with middle:
        st.pyplot(fig)
    plt.close(fig)
    return fig
def save_result(name, score, target_col, features_used, extra_info=None):
    """Record a finished model run in the session state.

    Updates ``last_model_name`` / ``last_score`` and appends one entry (the
    same dict object) to both ``history`` and ``model_results``.

    Args:
        name: Model name.
        score: Score of the run.
        target_col: Target column the model was trained on.
        features_used: Description of the features used.
        extra_info: Optional dict of additional fields merged into the entry;
            its keys override the standard ones on collision.
    """
    state = st.session_state
    state.last_model_name = name
    state.last_score = score
    record = {
        "Model": name,
        "Score": score,
        "Target": target_col,
        "Features": features_used,
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    record.update(extra_info or {})
    for log in (state.history, state.model_results):
        log.append(record)
+# --- REPORT GENERATORS ---
def generate_text_report(df, target, model_results):
    """Build the plain-text report covering the dataset and all model runs.

    Sections, in order: banner with generation time, dataset summary,
    per-column details, every model run (with whichever optional metrics the
    run recorded), the best run by ``Score`` (omitted when there are no runs),
    and a fixed description of the preprocessing pipeline.

    Args:
        df: DataFrame the report describes; must contain ``target``.
        target: Name of the target column.
        model_results: List of run dicts; each needs ``Model`` and ``Score``.

    Returns:
        The report as a single newline-joined string.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    def section(title):
        # A section heading framed by two light rules.
        return [light_rule, f"  {title}", light_rule]

    # Optional per-run fields: (key in the run dict, line template), in print order.
    optional_fields = (
        ("Precision", "  Precision: {:.2f}%"),
        ("Recall", "  Recall: {:.2f}%"),
        ("F1Score", "  F1 Score: {:.2f}%"),
        ("BestParams", "  Best Hyperparameters: {}"),
        ("OutliersClipped", "  Outliers Clipped: {} columns"),
        ("LowVarRemoved", "  Low Variance Features Removed: {}"),
        ("HighCorrRemoved", "  High Correlation Features Removed: {}"),
        ("ClassBalanced", "  Class Balancing Applied: {}"),
        ("BestK", "  Optimal Clusters (k): {}"),
    )
    pipeline_steps = (
        "Duplicate removal",
        "Missing values handled (median for numeric, mode for categorical)",
        "Unit conversion (km/cm/mm -> m)",
        "Categorical encoding (LabelEncoder)",
        "Outlier clipping (IQR method)",
        "Low variance feature removal",
        "High correlation feature removal",
        "Class imbalance handling (oversampling)",
        "Feature selection (mutual information, top 20)",
        "Scaling where required (StandardScaler / RobustScaler)",
        "Hyperparameter tuning (GridSearchCV)",
        "Stratified cross-validation (5-fold)",
    )

    top_run = None
    if model_results:
        top_run = max(model_results, key=lambda run: run["Score"])

    report = [
        heavy_rule,
        "  DARK AI AUTOML PLATFORM - FULL REPORT",
        heavy_rule,
        f"  Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        "",
    ]

    report += section("DATASET SUMMARY")
    report += [
        f"  Rows: {df.shape[0]}",
        f"  Columns: {df.shape[1]}",
        f"  Target Column: {target}",
        f"  Target Unique Values: {df[target].nunique()}",
        "",
    ]

    report += section("COLUMN DETAILS")
    for col in df.columns:
        column = df[col]
        report.append(
            f"  {col}: type={str(column.dtype)}, "
            f"unique={column.nunique()}, missing={column.isnull().sum()}"
        )
    report.append("")

    report += section("MODEL RESULTS (ALL RUNS)")
    for run_no, run in enumerate(model_results, 1):
        report += [
            "",
            f"  Run #{run_no}",
            f"  Model: {run['Model']}",
            f"  Accuracy/Score: {run['Score']:.2f}%",
            f"  Target Feature: {run.get('Target', 'N/A')}",
            f"  Features Used: {run.get('Features', 'N/A')}",
            f"  Timestamp: {run.get('Timestamp', 'N/A')}",
        ]
        report += [
            template.format(run[key])
            for key, template in optional_fields
            if key in run
        ]

    if top_run:
        report.append("")
        report += section("BEST MODEL")
        report += [
            f"  Model: {top_run['Model']}",
            f"  Score: {top_run['Score']:.2f}%",
            f"  Target: {top_run.get('Target', 'N/A')}",
        ]

    report.append("")
    report += section("PREPROCESSING PIPELINE")
    report += [f"  - {step}" for step in pipeline_steps]
    report += ["", heavy_rule, "  END OF REPORT", heavy_rule]
    return "\n".join(report)
def generate_xlsx_report(df, target, model_results):
    """Build the multi-sheet XLSX report in memory.

    Sheets: "Dataset Summary", "Column Details", "Model Results", and, when
    there is at least one run, "Best Model" (the run with the highest
    ``Score``). The workbook is written with the ``openpyxl`` engine.

    Args:
        df: DataFrame the report describes; must contain ``target``.
        target: Name of the target column.
        model_results: List of run dicts.

    Returns:
        An ``io.BytesIO`` holding the workbook, rewound to position 0.
    """
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        # Sheet 1: Dataset Summary
        overview = pd.DataFrame({
            "Property": ["Rows", "Columns", "Target Column", "Target Unique Values"],
            "Value": [df.shape[0], df.shape[1], target, df[target].nunique()]
        })
        overview.to_excel(writer, sheet_name="Dataset Summary", index=False)
        # Sheet 2: Column Details
        per_column = [
            {
                "Column": name,
                "Type": str(df[name].dtype),
                "Unique Values": df[name].nunique(),
                "Missing Values": df[name].isnull().sum(),
            }
            for name in df.columns
        ]
        pd.DataFrame(per_column).to_excel(
            writer, sheet_name="Column Details", index=False
        )
        # Sheet 3: Model Results
        pd.DataFrame(model_results).to_excel(
            writer, sheet_name="Model Results", index=False
        )
        # Sheet 4: Best Model
        if model_results:
            top_run = max(model_results, key=lambda run: run["Score"])
            pd.DataFrame([top_run]).to_excel(
                writer, sheet_name="Best Model", index=False
            )
    buffer.seek(0)
    return buffer
+# ==========================================================
+# UPLOAD
+# ==========================================================
# Section banner followed by the CSV-only upload widget. The `if file:` guard
# further down runs the rest of the app only once `file` is truthy.
_upload_banner = '<div class="section">📁 Upload Dataset</div>'
st.markdown(_upload_banner, unsafe_allow_html=True)
file = st.file_uploader("Upload CSV File", type=["csv"])
+# ==========================================================
+# MAIN APP
+# ==========================================================
+if file:
+    raw = pd.read_csv(file)
+    st.markdown('<div class="section">📌 Dataset Preview</div>', unsafe_allow_html=True)
+    st.dataframe(raw.head(), use_container_width=True)
+    df = smart_clean(raw)
+    df = detect_unit_columns(df)
+    st.session_state.cleaned_df = df
+    # ------------------------------------------------------
+    # TARGET DETECTION
+    # ------------------------------------------------------
+    st.markdown('<div class="section">🎯 AI Target Detection</div>', unsafe_allow_html=True)
+    best_target, top5 = detect_best_target(df)
+    st.success(f"Recommended Target Column: {best_target}")
+    st.write("Top Suggestions:")
+    for n, s in top5:
+        st.write(f"• {n} (score: {s})")
+    # Dropdown with AI recommendation pre-selected, user can override
+    target = st.selectbox(
+        "Choose Target Column (AI recommended is pre-selected - change if needed)",
+        [best_target] + [c for c in df.columns if c != best_target]
+    )
+    st.session_state.selected_target = target
+    # ------------------------------------------------------
+    # MODEL SELECT
+    # ------------------------------------------------------
+    st.markdown('<div class="section">🤖 Choose Model</div>', unsafe_allow_html=True)
+    model_choice = st.selectbox(
+        "Select One Model",
+        [
+            "Random Forest",
+            "SVM",
+            "Logistic Regression",
+            "Decision Tree",
+            "KMeans Clustering"
+        ]
+    )
+    # ------------------------------------------------------
+    # APPLY MODEL
+    # ------------------------------------------------------
+    if st.button("🚀 Apply Model"):
+        # Each model result is in its own container so
+        #  applying a second model shows results separately beneath the first
+        # RANDOM FOREST
+        if model_choice == "Random Forest":
+            X, y, transformed, pp_info = preprocess_for_model(df, target)
+            features_used = pp_info["features_used"]
+            result_box = st.container()
+            with result_box:
+                st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
+                st.markdown(f"### Random Forest Results (Target: {target})")
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.write("Original")
+                    st.dataframe(raw.head())
+                with col2:
+                    st.write("Processed")
+                    st.dataframe(transformed.head())
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X, y, test_size=0.2, random_state=42, stratify=y
+                )
+                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+                model = GridSearchCV(
+                    RandomForestClassifier(),
+                    {
+                        "n_estimators":[100,200,300],
+                        "max_depth":[5,10,15,None],
+                        "min_samples_split":[2,5],
+                        "min_samples_leaf":[1,2]
+                    },
+                    cv=cv,
+                    n_jobs=-1
+                )
+                model.fit(X_train, y_train)
+                pred = model.predict(X_test)
+                acc = accuracy_score(y_test, pred)*100
+                prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
+                rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
+                f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
+                st.success(f"Accuracy: {acc:.2f}%")
+                st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
+                show_confusion(y_test, pred, "Random Forest Matrix")
+                imp = pd.Series(
+                    model.best_estimator_.feature_importances_,
+                    index=X.columns
+                ).sort_values(ascending=False).head(8)
+                compact_bar(imp.index, imp.values, "Feature Importance")
+                st.write("**Classification Report:**")
+                st.text(classification_report(y_test, pred, zero_division=0))
+                st.markdown('</div>', unsafe_allow_html=True)
+            joblib.dump(model.best_estimator_, "random_forest.pkl")
+            save_result("Random Forest", acc, target, ", ".join(features_used), {
+                "Precision": prec,
+                "Recall": rec,
+                "F1Score": f1,
+                "BestParams": str(model.best_params_),
+                "OutliersClipped": len(pp_info["outliers_clipped"]),
+                "LowVarRemoved": str(pp_info["low_var_removed"]),
+                "HighCorrRemoved": str(pp_info["high_corr_removed"]),
+                "ClassBalanced": pp_info["class_balanced"],
+            })
+        # SVM
+        elif model_choice == "SVM":
+            X, y, transformed, pp_info = preprocess_for_model(df, target)
+            features_used = pp_info["features_used"]
+            result_box = st.container()
+            with result_box:
+                st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
+                st.markdown(f"### SVM Results (Target: {target})")
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X, y, test_size=0.2, random_state=42, stratify=y
+                )
+                # RobustScaler for SVM (handles outliers better)
+                sc = RobustScaler()
+                X_train = sc.fit_transform(X_train)
+                X_test = sc.transform(X_test)
+                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+                model = GridSearchCV(
+                    SVC(),
+                    {
+                        "C":[0.1,1,10,100],
+                        "kernel":["rbf","linear","poly"],
+                        "gamma":["scale","auto"]
+                    },
+                    cv=cv,
+                    n_jobs=-1
+                )
+                model.fit(X_train, y_train)
+                pred = model.predict(X_test)
+                acc = accuracy_score(y_test, pred)*100
+                prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
+                rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
+                f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
+                st.success(f"Accuracy: {acc:.2f}%")
+                st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
+                show_confusion(y_test, pred, "SVM Matrix")
+                st.write("**Classification Report:**")
+                st.text(classification_report(y_test, pred, zero_division=0))
+                st.markdown('</div>', unsafe_allow_html=True)
+            joblib.dump(model.best_estimator_, "svm.pkl")
+            save_result("SVM", acc, target, ", ".join(features_used), {
+                "Precision": prec,
+                "Recall": rec,
+                "F1Score": f1,
+                "BestParams": str(model.best_params_),
+                "OutliersClipped": len(pp_info["outliers_clipped"]),
+                "LowVarRemoved": str(pp_info["low_var_removed"]),
+                "HighCorrRemoved": str(pp_info["high_corr_removed"]),
+                "ClassBalanced": pp_info["class_balanced"],
+            })
+        # LOGISTIC
+        elif model_choice == "Logistic Regression":
+            X, y, transformed, pp_info = preprocess_for_model(df, target)
+            features_used = pp_info["features_used"]
+            result_box = st.container()
+            with result_box:
+                st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
+                st.markdown(f"### Logistic Regression Results (Target: {target})")
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X, y, test_size=0.2, random_state=42, stratify=y
+                )
+                sc = StandardScaler()
+                X_train = sc.fit_transform(X_train)
+                X_test = sc.transform(X_test)
+                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+                model = GridSearchCV(
+                    LogisticRegression(max_iter=5000, solver="liblinear"),
+                    {
+                        "C":[0.01,0.1,1,10,100],
+                        "penalty":["l1","l2"]
+                    },
+                    cv=cv,
+                    n_jobs=-1
+                )
+                model.fit(X_train, y_train)
+                pred = model.predict(X_test)
+                acc = accuracy_score(y_test, pred)*100
+                prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
+                rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
+                f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
+                st.success(f"Accuracy: {acc:.2f}%")
+                st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
+                show_confusion(y_test, pred, "Logistic Regression Matrix")
+                # Show coefficient magnitudes for logistic regression
+                if hasattr(model.best_estimator_, "coef_"):
+                    coef = pd.Series(
+                        np.abs(model.best_estimator_.coef_[0]),
+                        index=X.columns
+                    ).sort_values(ascending=False).head(8)
+                    compact_bar(coef.index, coef.values, "Feature Coefficients (Absolute)")
+                st.write("**Classification Report:**")
+                st.text(classification_report(y_test, pred, zero_division=0))
+                st.markdown('</div>', unsafe_allow_html=True)
+            joblib.dump(model.best_estimator_, "logistic.pkl")
+            save_result("Logistic Regression", acc, target, ", ".join(features_used), {
+                "Precision": prec,
+                "Recall": rec,
+                "F1Score": f1,
+                "BestParams": str(model.best_params_),
+                "OutliersClipped": len(pp_info["outliers_clipped"]),
+                "LowVarRemoved": str(pp_info["low_var_removed"]),
+                "HighCorrRemoved": str(pp_info["high_corr_removed"]),
+                "ClassBalanced": pp_info["class_balanced"],
+            })
+        # DECISION TREE
+        elif model_choice == "Decision Tree":
+            X, y, transformed, pp_info = preprocess_for_model(df, target)
+            features_used = pp_info["features_used"]
+            result_box = st.container()
+            with result_box:
+                st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
+                st.markdown(f"### Decision Tree Results (Target: {target})")
+                X_train, X_test, y_train, y_test = train_test_split(
+                    X, y, test_size=0.2, random_state=42, stratify=y
+                )
+                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
+                model = GridSearchCV(
+                    DecisionTreeClassifier(),
+                    {
+                        "max_depth":[3,5,10,15,None],
+                        "min_samples_split":[2,5,10],
+                        "min_samples_leaf":[1,2,4],
+                        "criterion":["gini","entropy"]
+                    },
+                    cv=cv,
+                    n_jobs=-1
+                )
+                model.fit(X_train, y_train)
+                pred = model.predict(X_test)
+                acc = accuracy_score(y_test, pred)*100
+                prec = precision_score(y_test, pred, average="weighted", zero_division=0)*100
+                rec = recall_score(y_test, pred, average="weighted", zero_division=0)*100
+                f1 = f1_score(y_test, pred, average="weighted", zero_division=0)*100
+                st.success(f"Accuracy: {acc:.2f}%")
+                st.info(f"Precision: {prec:.2f}% | Recall: {rec:.2f}% | F1: {f1:.2f}%")
+                show_confusion(y_test, pred, "Decision Tree Matrix")
+                # Feature importance for decision tree
+                imp = pd.Series(
+                    model.best_estimator_.feature_importances_,
+                    index=X.columns
+                ).sort_values(ascending=False).head(8)
+                compact_bar(imp.index, imp.values, "Feature Importance")
+                st.write("**Classification Report:**")
+                st.text(classification_report(y_test, pred, zero_division=0))
+                st.markdown('</div>', unsafe_allow_html=True)
+            joblib.dump(model.best_estimator_, "decision_tree.pkl")
+            save_result("Decision Tree", acc, target, ", ".join(features_used), {
+                "Precision": prec,
+                "Recall": rec,
+                "F1Score": f1,
+                "BestParams": str(model.best_params_),
+                "OutliersClipped": len(pp_info["outliers_clipped"]),
+                "LowVarRemoved": str(pp_info["low_var_removed"]),
+                "HighCorrRemoved": str(pp_info["high_corr_removed"]),
+                "ClassBalanced": pp_info["class_balanced"],
+            })
+        # KMEANS
+        elif model_choice == "KMeans Clustering":
+            temp = df.copy()
+            for col in temp.columns:
+                if temp[col].dtype == "object":
+                    le = LabelEncoder()
+                    temp[col] = le.fit_transform(temp[col].astype(str))
+            X = temp.drop(columns=[target])
+            # Clip outliers for clustering too
+            temp_clipped, outlier_info = clip_outliers_iqr(temp)
+            X_clipped = temp_clipped.drop(columns=[target])
+            sc = StandardScaler()
+            Xs = sc.fit_transform(X_clipped)
+            # Find optimal k using elbow method
+            inertias = []
+            K_range = range(2, min(11, len(df) // 10 + 1))
+            for k in K_range:
+                km = KMeans(n_clusters=k, random_state=42, n_init=10)
+                km.fit(Xs)
+                inertias.append(km.inertia_)
+            best_k = 3
+            if len(inertias) >= 3:
+                diffs = [inertias[i] - inertias[i+1] for i in range(len(inertias)-1)]
+                if diffs:
+                    elbow_idx = np.argmax(diffs) + 1
+                    best_k = list(K_range)[elbow_idx] if elbow_idx < len(list(K_range)) else 3
+                    best_k = max(2, min(best_k, 10))
+            result_box = st.container()
+            with result_box:
+                st.markdown('<div class="model-result-box">', unsafe_allow_html=True)
+                st.markdown(f"### KMeans Clustering Results (Target: {target})")
+                model = KMeans(n_clusters=best_k, random_state=42, n_init=10)
+                cluster = model.fit_predict(Xs)
+                score = silhouette_score(Xs, cluster)*100
+                st.success(f"Cluster Quality Score: {score:.2f}% (k={best_k})")
+                fig, ax = plt.subplots(figsize=(6,4))
+                plt.scatter(Xs[:,0], Xs[:,1], c=cluster, cmap="viridis")
+                plt.title(f"Clusters (k={best_k})")
+                col1, col2, col3 = st.columns([1, 2, 1])
+                with col2:
+                    st.pyplot(fig)
+                # Elbow plot
+                fig2, ax2 = plt.subplots(figsize=(6,3))
+                plt.plot(list(K_range), inertias, "bo-")
+                plt.xlabel("Number of Clusters (k)")
+                plt.ylabel("Inertia")
+                plt.title("Elbow Method")
+                col1, col2, col3 = st.columns([1, 2, 1])
+                with col2:
+                    st.pyplot(fig2)
+                # Cluster distribution
+                cluster_counts = pd.Series(cluster).value_counts().sort_index()
+                fig3, ax3 = plt.subplots(figsize=(6,3))
+                sns.barplot(x=cluster_counts.index, y=cluster_counts.values)
+                plt.xlabel("Cluster")
+                plt.ylabel("Count")
+                plt.title("Cluster Distribution")
+                col1, col2, col3 = st.columns([1, 2, 1])
+                with col2:
+                    st.pyplot(fig3)
+                st.markdown('</div>', unsafe_allow_html=True)
+            joblib.dump(model, "kmeans.pkl")
+            save_result("KMeans Clustering", score, target, ", ".join(X_clipped.columns), {
+                "BestK": best_k,
+                "OutliersClipped": len(outlier_info),
+            })
+# ==========================================================
+# DOWNLOAD SECTION
+# ==========================================================
+if st.session_state.last_model_name:
+    st.markdown('<div class="section">⬇ Downloads</div>', unsafe_allow_html=True)
+    file_map = {
+        "Random Forest":"random_forest.pkl",
+        "SVM":"svm.pkl",
+        "Logistic Regression":"logistic.pkl",
+        "Decision Tree":"decision_tree.pkl",
+        "KMeans Clustering":"kmeans.pkl"
+    }
+    current = file_map[st.session_state.last_model_name]
+    if os.path.exists(current):
+        with open(current, "rb") as f:
+            st.download_button(
+                label=f"Download {st.session_state.last_model_name} (Deploy Ready)",
+                data=f,
+                file_name=current,
+                mime="application/octet-stream"
+            )
+# ==========================================================
+# HISTORY + REPORTS
+# ==========================================================
+if len(st.session_state.history) > 0:
+    st.markdown('<div class="section">📊 History</div>', unsafe_allow_html=True)
+    hist = pd.DataFrame(st.session_state.history)
+    st.dataframe(hist, use_container_width=True)
+    fig, ax = plt.subplots(figsize=(6,3))
+    sns.barplot(data=hist, x="Model", y="Score")
+    plt.xticks(rotation=20)
+    plt.title("All Applied Models")
+    col1, col2, col3 = st.columns([1, 2, 1])
+    with col2:
+        st.pyplot(fig)
+    # CSV
+    csv_buffer = io.StringIO()
+    hist.to_csv(csv_buffer, index=False)
+    st.download_button(
+        "Download Results CSV",
+        csv_buffer.getvalue(),
+        "results.csv"
+    )
+    # TXT report
+    if st.session_state.cleaned_df is not None and len(st.session_state.model_results) > 0:
+        report_text = generate_text_report(
+            st.session_state.cleaned_df,
+            st.session_state.selected_target or "unknown",
+            st.session_state.model_results
+        )
+        st.download_button(
+            "Download Full Report (TXT)",
+            report_text,
+            "full_report.txt",
+            mime="text/plain"
+        )
+        # XLSX report
+        try:
+            xlsx_data = generate_xlsx_report(
+                st.session_state.cleaned_df,
+                st.session_state.selected_target or "unknown",
+                st.session_state.model_results
+            )
+            st.download_button(
+                "Download Full Report (XLSX)",
+                data=xlsx_data.getvalue(),
+                file_name="full_report.xlsx",
+                mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+            )
+        except Exception:
+            pass
+# ==========================================================
+# RESET
+# ==========================================================
+st.markdown('<div class="section">♻ Reset</div>', unsafe_allow_html=True)
+if st.button("Clear History"):
+    st.session_state.history = []
+    st.session_state.last_model_name = None
+    st.session_state.last_score = None
+    st.session_state.model_results = []
+    st.session_state.selected_target = None
+    st.session_state.cleaned_df = None
+    st.success("History Cleared")

random_forest.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8aa8be408ccf6fb6ec8b9937082a4d9db1b9129c3d2b1c462377ba172ae805b2
+size 2105289

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+# requirements.txt
+streamlit
+pandas
+numpy
+matplotlib
+seaborn
+scikit-learn
+joblib
+python-docx
+python-pptx
+openpyxl