# Hugging Face Space: Jurk06 / Auto-ML (status: Sleeping; file size: 6,086 bytes)

import gradio as gr
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import classification_report, mean_squared_error, r2_score, precision_score, recall_score, f1_score
from io import StringIO
import requests

# Helper functions
def load_data(file=None, url=None):
    """Load a CSV dataset into a DataFrame from a URL or an uploaded file.

    The URL takes precedence when both inputs are supplied.

    Args:
        file: A filesystem path (``str`` or path-like) or an object exposing
            the path through a ``name`` attribute (e.g. a temporary-file
            wrapper). Ignored when ``url`` is non-empty.
        url: HTTP(S) address of a CSV file. ``None``, an empty string or a
            whitespace-only string means "no URL".

    Returns:
        pandas.DataFrame: The parsed CSV content.

    Raises:
        ValueError: If neither a usable URL nor a file is provided.
        requests.HTTPError: If the server answers with an error status.
    """
    # A text box that was left blank (or contains only spaces) is not a URL.
    if isinstance(url, str):
        url = url.strip()

    if url:
        # The timeout keeps an unresponsive host from blocking the request
        # forever; raise_for_status() stops an HTML error page from being
        # parsed as if it were CSV data.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return pd.read_csv(StringIO(response.content.decode('utf-8')))

    if file is None:
        raise ValueError("Provide either a CSV file or a URL to a CSV file.")

    # Accept a plain path as well as an upload object carrying the path in
    # its ``name`` attribute.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = file.name
    return pd.read_csv(path)

def basic_eda(df):
    """Summarise a DataFrame for a quick exploratory overview.

    Args:
        df: The DataFrame to describe.

    Returns:
        dict: Keys ``"Shape"``, ``"Columns"``, ``"Missing Values"``,
        ``"Data Types"`` and ``"Description"``, holding the frame's shape,
        its column labels, the per-column null counts, the per-column dtype
        names and the output of ``describe(include='all')`` as a nested dict.
    """
    report = {}
    report["Shape"] = df.shape
    report["Columns"] = df.columns.tolist()

    null_counts = df.isnull().sum()
    report["Missing Values"] = null_counts.to_dict()

    dtype_names = df.dtypes.astype(str)
    report["Data Types"] = dtype_names.to_dict()

    # include='all' makes describe() cover non-numeric columns as well.
    statistics = df.describe(include='all')
    report["Description"] = statistics.to_dict()
    return report

def impute_missing(df):
    """Return a copy of ``df`` with missing values filled in.

    Numeric columns are filled with their column mean; every other column is
    filled with its most frequent value (the smallest one when several values
    tie, as returned first by ``Series.mode``). A column that contains no
    observed value at all has no statistic to fill from and is left unchanged.

    The input DataFrame is not modified.

    Args:
        df: The DataFrame to impute.

    Returns:
        pandas.DataFrame: A new DataFrame with the same columns and index.
    """
    # Work on a copy so the caller's frame is never altered as a side effect.
    filled = df.copy()

    num_cols = filled.select_dtypes(include=np.number).columns.tolist()
    cat_cols = filled.select_dtypes(exclude=np.number).columns.tolist()

    if num_cols:
        # fillna with a Series aligns on column labels, so each column gets
        # its own mean. An all-NaN column has a NaN mean and stays as it is.
        column_means = filled[num_cols].mean()
        filled[num_cols] = filled[num_cols].fillna(column_means)

    for col in cat_cols:
        modes = filled[col].mode(dropna=True)
        if modes.empty:
            # Nothing observed in this column; nothing to impute with.
            continue
        filled[col] = filled[col].fillna(modes.iloc[0])

    return filled

def detect_outliers(df, threshold=3):
    """Return ``df`` without the rows that contain a numeric outlier.

    A value is an outlier when the absolute value of its z-score (computed
    per numeric column with the sample standard deviation) is greater than
    or equal to ``threshold``. Non-numeric columns are ignored.

    Args:
        df: The DataFrame to filter.
        threshold: Absolute z-score at or above which a value counts as an
            outlier. Defaults to 3.

    Returns:
        pandas.DataFrame: The rows of ``df`` with no outlying numeric value.
    """
    numeric_df = df.select_dtypes(include=np.number)
    z_scores = (numeric_df - numeric_df.mean()) / numeric_df.std()

    # Use the magnitude so that unusually low values are removed as well as
    # unusually high ones. Undefined z-scores (NaN, e.g. from a constant
    # column whose standard deviation is 0) compare False here, so they never
    # mark a row as an outlier.
    is_outlier = z_scores.abs() >= threshold
    return df[~is_outlier.any(axis=1)]

def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R², or NaN when too few samples make it undefined."""
    degrees_of_freedom = n_samples - n_features - 1
    if degrees_of_freedom <= 0:
        # With p >= n - 1 the correction factor divides by zero or flips
        # sign, so the statistic carries no meaning.
        return float("nan")
    return 1 - (1 - r2) * ((n_samples - 1) / degrees_of_freedom)


def _classification_scores(prefix, y_true, y_pred):
    """Return weighted precision/recall/F1 keyed as '<prefix> <metric>'."""
    return {
        f"{prefix} Precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        f"{prefix} Recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        f"{prefix} F1-Score": f1_score(y_true, y_pred, average='weighted', zero_division=0),
    }


def _regression_scores(prefix, y_true, y_pred, n_features):
    """Return rounded R², adjusted R² and RMSE keyed as '<prefix> <metric>'."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return {
        f"{prefix} R2": round(r2, 4),
        f"{prefix} Adjusted R2": round(_adjusted_r2(r2, len(y_true), n_features), 4),
        f"{prefix} RMSE": round(rmse, 4),
    }


def train_models(df, target, task):
    """Fit a fixed set of models and tabulate train/test metrics.

    Features are every column except ``target``, one-hot encoded with
    ``pd.get_dummies``. The data is split 80/20 with a fixed seed, and the
    same seed is given to the randomised estimators so repeated runs on the
    same data produce the same table.

    Args:
        df: The dataset, including the target column.
        target: Name of the column to predict.
        task: ``'classification'`` selects the classifiers; any other value
            selects the regressors.

    Returns:
        pandas.DataFrame: One row per model. Classification rows hold
        weighted precision, recall and F1 for train and test; regression
        rows hold R², adjusted R² and RMSE for train and test, rounded to
        four decimals. Adjusted R² is NaN when a split has too few samples
        for the number of features.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in the dataset")

    seed = 42
    X = pd.get_dummies(df.drop(columns=[target]))
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    is_classification = task == 'classification'
    if is_classification:
        models = [
            RandomForestClassifier(random_state=seed),
            LogisticRegression(max_iter=1000),
            GradientBoostingClassifier(random_state=seed),
            KNeighborsClassifier(),
            SVC(),
        ]
    else:
        models = [
            RandomForestRegressor(random_state=seed),
            LinearRegression(),
            GradientBoostingRegressor(random_state=seed),
            KNeighborsRegressor(),
            Ridge(),
        ]

    n_features = X_train.shape[1]
    results_table = []
    for model in models:
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        row = {"Model": model.__class__.__name__}
        if is_classification:
            row.update(_classification_scores("Train", y_train, y_train_pred))
            row.update(_classification_scores("Test", y_test, y_test_pred))
        else:
            row.update(_regression_scores("Train", y_train, y_train_pred, n_features))
            row.update(_regression_scores("Test", y_test, y_test_pred, n_features))
        results_table.append(row)

    return pd.DataFrame(results_table)

def visualize(df, x_col, y_col):
    """Plot one or two columns of ``df`` and save the figure as a PNG.

    Args:
        df: The data to plot.
        x_col: Column shown on the x-axis.
        y_col: Column shown on the y-axis. When falsy (``None`` or an empty
            string) a histogram with a KDE curve of ``x_col`` is drawn
            instead of a scatter plot.

    Returns:
        str: Path of the saved image, always ``"plot.png"`` in the current
        working directory (overwritten on every call).
    """
    output_path = "plot.png"
    # Drawing on an explicit figure/axes pair avoids depending on pyplot's
    # implicit "current figure" state.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        if y_col:
            sns.scatterplot(data=df, x=x_col, y=y_col, ax=ax)
        else:
            sns.histplot(df[x_col], kde=True, ax=ax)
        fig.tight_layout()
        fig.savefig(output_path)
    finally:
        # Always release the figure, even when plotting fails (for example
        # on an unknown column name); otherwise figures pile up in memory.
        plt.close(fig)
    return output_path

# Gradio UI
def process(file, url, task, target, x_feature, y_feature):
    """Run the full pipeline for one request from the interface.

    Loads the dataset, summarises it, passes it through ``impute_missing``
    and ``detect_outliers``, plots the requested feature(s) and trains the
    models.

    Args:
        file: Uploaded CSV file, forwarded to ``load_data``.
        url: URL of a CSV file, forwarded to ``load_data``.
        task: Task type, forwarded to ``train_models``.
        target: Name of the target column, forwarded to ``train_models``.
        x_feature: Column for the x-axis of the plot.
        y_feature: Column for the y-axis of the plot.

    Returns:
        tuple: ``(eda_summary, plot_path, results_dataframe)``.
    """
    dataset = load_data(file, url)
    # The summary is taken from the data as loaded, before any cleaning step.
    summary = basic_eda(dataset)
    cleaned = detect_outliers(impute_missing(dataset))
    figure_path = visualize(cleaned, x_feature, y_feature)
    scores = train_models(cleaned, target, task)
    return summary, figure_path, scores

# Input widgets, listed in the positional order of `process`'s parameters.
_input_components = [
    gr.File(label="Upload CSV File", file_types=['.csv']),
    gr.Textbox(label="Or enter URL to CSV", placeholder="https://...", lines=1),
    gr.Radio(["classification", "regression"], label="Select Task Type"),
    gr.Textbox(label="Target Column Name"),
    gr.Textbox(label="Feature for X-Axis (for visualization)"),
    gr.Textbox(label="Feature for Y-Axis (optional, for scatter plot)"),
]

# Output widgets, listed in the order of the tuple returned by `process`.
_output_components = [
    gr.JSON(label="Basic EDA"),
    gr.Image(type="filepath", label="Feature Plot"),
    gr.Dataframe(label="Model Performance"),
]

demo = gr.Interface(
    fn=process,
    inputs=_input_components,
    outputs=_output_components,
    title="AutoML Dashboard",
    description="Upload a dataset or provide a URL. Select task type, enter target column, choose features to visualize, and evaluate models.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()