|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
from sklearn.impute import SimpleImputer |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor |
|
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge |
|
from sklearn.svm import SVC, SVR |
|
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor |
|
from sklearn.metrics import classification_report, mean_squared_error, r2_score, precision_score, recall_score, f1_score |
|
from io import StringIO |
|
import requests |
|
|
|
|
|
def load_data(file=None, url=None):
    """Load a CSV into a DataFrame from an uploaded file or a URL.

    Args:
        file: uploaded file object exposing a ``.name`` path (Gradio File).
        url: URL pointing at a raw CSV; takes precedence when non-empty.

    Returns:
        pd.DataFrame parsed from the CSV.

    Raises:
        ValueError: if neither a file nor a URL was provided.
        requests.HTTPError: if the URL responds with an error status.
    """
    if url:
        # Fail fast on HTTP errors instead of feeding an error page to read_csv.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        df = pd.read_csv(StringIO(response.text))
    elif file is not None:
        df = pd.read_csv(file.name)
    else:
        # Previously file.name raised an opaque AttributeError here.
        raise ValueError("Provide either an uploaded CSV file or a URL to a CSV.")
    return df
|
|
|
def basic_eda(df):
    """Return a quick exploratory summary of *df*.

    The summary covers shape, column names, per-column missing-value
    counts, dtypes (as strings, for JSON display), and describe() stats.
    """
    summary = {}
    summary["Shape"] = df.shape
    summary["Columns"] = list(df.columns)
    summary["Missing Values"] = df.isnull().sum().to_dict()
    summary["Data Types"] = df.dtypes.astype(str).to_dict()
    summary["Description"] = df.describe(include='all').to_dict()
    return summary
|
|
|
def impute_missing(df):
    """Return a copy of *df* with missing values imputed.

    Numeric columns are filled with their column mean; non-numeric columns
    with their most frequent value (mode). The input frame is NOT mutated.

    Notes on the rewrite: the previous SimpleImputer-based version mutated
    the caller's DataFrame in place, and SimpleImputer silently drops
    all-NaN columns, which made the subsequent pd.DataFrame(...) rebuild
    crash with a shape mismatch. Plain pandas fillna avoids both issues
    and also preserves integer dtypes where no NaN is present.
    """
    df = df.copy()  # never mutate the caller's frame
    num_cols = df.select_dtypes(include=np.number).columns
    cat_cols = df.select_dtypes(exclude=np.number).columns

    if len(num_cols):
        df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

    for col in cat_cols:
        if df[col].isnull().any():
            mode = df[col].mode(dropna=True)
            # An all-NaN column has no mode; leave it untouched rather than crash.
            if not mode.empty:
                df[col] = df[col].fillna(mode.iloc[0])

    return df
|
|
|
def detect_outliers(df):
    """Drop rows containing a numeric outlier (|z-score| >= 3).

    Bug fixes vs. the original:
    - The original compared the SIGNED z-score (``z < 3``), so extreme
      negative outliers were never removed; we compare ``abs(z)``.
    - A constant column (std == 0) produced all-NaN z-scores, which made
      the comparison False and silently dropped EVERY row; NaN z-scores
      are now treated as non-outliers. (Callers impute missing values
      first, so a NaN z-score here only means zero variance.)
    """
    numeric = df.select_dtypes(include=np.number)
    z = (numeric - numeric.mean()) / numeric.std()
    z = z.fillna(0)  # zero-variance columns -> no outliers, keep the rows
    return df[(z.abs() < 3).all(axis=1)]
|
|
|
def train_models(df, target, task):
    """Fit a suite of baseline models and return a per-model metrics table.

    Args:
        df: cleaned DataFrame containing the target column.
        target: name of the column to predict.
        task: ``'classification'`` or ``'regression'``; any other value is
            treated as regression (matching the UI's two radio options).

    Returns:
        pd.DataFrame with one row per model and train/test metrics.
    """
    X = pd.get_dummies(df.drop(columns=[target]))  # one-hot encode categoricals
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    if task == 'classification':
        rows = _score_classifiers(X_train, X_test, y_train, y_test)
    else:
        rows = _score_regressors(X_train, X_test, y_train, y_test)

    return pd.DataFrame(rows)


def _score_classifiers(X_train, X_test, y_train, y_test):
    """Fit standard classifiers; report weighted precision/recall/F1."""
    models = [
        RandomForestClassifier(),
        LogisticRegression(max_iter=1000),
        GradientBoostingClassifier(),
        KNeighborsClassifier(),
        SVC(),
    ]
    rows = []
    for model in models:
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        rows.append({
            "Model": model.__class__.__name__,
            "Train Precision": precision_score(y_train, pred_train, average='weighted', zero_division=0),
            "Train Recall": recall_score(y_train, pred_train, average='weighted', zero_division=0),
            "Train F1-Score": f1_score(y_train, pred_train, average='weighted', zero_division=0),
            "Test Precision": precision_score(y_test, pred_test, average='weighted', zero_division=0),
            "Test Recall": recall_score(y_test, pred_test, average='weighted', zero_division=0),
            "Test F1-Score": f1_score(y_test, pred_test, average='weighted', zero_division=0),
        })
    return rows


def _adjusted_r2(r2, n_samples, n_features):
    """Adjusted R^2; NaN when there are too few samples to adjust.

    The original formula raised ZeroDivisionError (or flipped sign)
    whenever n_samples - n_features - 1 <= 0, which is common on small
    datasets after one-hot encoding.
    """
    denom = n_samples - n_features - 1
    if denom <= 0:
        return float('nan')
    return 1 - (1 - r2) * (n_samples - 1) / denom


def _score_regressors(X_train, X_test, y_train, y_test):
    """Fit standard regressors; report R2, adjusted R2 and RMSE."""
    models = [
        RandomForestRegressor(),
        LinearRegression(),
        GradientBoostingRegressor(),
        KNeighborsRegressor(),
        Ridge(),
    ]
    rows = []
    for model in models:
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        r2_train = r2_score(y_train, pred_train)
        r2_test = r2_score(y_test, pred_test)
        rows.append({
            "Model": model.__class__.__name__,
            "Train R2": round(r2_train, 4),
            "Train Adjusted R2": round(_adjusted_r2(r2_train, len(y_train), X_train.shape[1]), 4),
            "Train RMSE": round(float(np.sqrt(mean_squared_error(y_train, pred_train))), 4),
            "Test R2": round(r2_test, 4),
            "Test Adjusted R2": round(_adjusted_r2(r2_test, len(y_test), X_test.shape[1]), 4),
            "Test RMSE": round(float(np.sqrt(mean_squared_error(y_test, pred_test))), 4),
        })
    return rows
|
|
|
def visualize(df, x_col, y_col, out_path="plot.png"):
    """Save a plot of one or two columns and return the image path.

    Args:
        df: data to plot.
        x_col: column for the x-axis.
        y_col: optional column for the y-axis; when truthy a scatter plot
            is drawn, otherwise a histogram (with KDE) of x_col.
        out_path: where to save the figure. The default preserves the old
            hard-coded "plot.png" for backward compatibility.

    Returns:
        The path the figure was saved to.
    """
    plt.figure(figsize=(8, 6))
    try:
        if y_col:
            sns.scatterplot(data=df, x=x_col, y=y_col)
        else:
            sns.histplot(df[x_col], kde=True)
        plt.tight_layout()
        plt.savefig(out_path)
    finally:
        # Always release the figure so a plotting error (e.g. bad column
        # name) doesn't leak figures across repeated Gradio calls.
        plt.close()
    return out_path
|
|
|
|
|
def process(file, url, task, target, x_feature, y_feature):
    """End-to-end pipeline: load -> EDA -> impute -> outlier filter -> plot -> train.

    Args:
        file/url: data source, forwarded to load_data.
        task: 'classification' or 'regression'.
        target: column to predict.
        x_feature: column for the x-axis of the visualization.
        y_feature: optional column for the y-axis (scatter plot).

    Returns:
        (eda dict, plot image path, metrics DataFrame).

    Raises:
        ValueError: if the target or x-axis column is missing from the
            data, so the user sees a clear message instead of a KeyError
            raised deep inside plotting or training.
    """
    df = load_data(file, url)
    eda = basic_eda(df)
    df = impute_missing(df)
    df = detect_outliers(df)

    # Validate user-typed column names before handing them to plotting/training.
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in dataset.")
    if x_feature not in df.columns:
        raise ValueError(f"X-axis feature '{x_feature}' not found in dataset.")

    plot_path = visualize(df, x_feature, y_feature)
    results_df = train_models(df, target, task)
    return eda, plot_path, results_df
|
|
|
# Gradio interface definition: maps the six UI inputs onto `process` and
# renders its three outputs (EDA dict, saved plot image, metrics table).
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.File(label="Upload CSV File", file_types=['.csv']),
        gr.Textbox(label="Or enter URL to CSV", placeholder="https://...", lines=1),
        gr.Radio(["classification", "regression"], label="Select Task Type"),
        gr.Textbox(label="Target Column Name"),
        gr.Textbox(label="Feature for X-Axis (for visualization)"),
        gr.Textbox(label="Feature for Y-Axis (optional, for scatter plot)"),
    ],
    outputs=[
        gr.JSON(label="Basic EDA"),
        gr.Image(type="filepath", label="Feature Plot"),
        gr.Dataframe(label="Model Performance")
    ],
    title="AutoML Dashboard",
    description="Upload a dataset or provide a URL. Select task type, enter target column, choose features to visualize, and evaluate models."
)


# Launch the web app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
|