"""AutoML dashboard built on Gradio.

Loads a CSV (uploaded file or URL), reports basic exploratory statistics,
imputes missing values, filters z-score outliers, renders one feature plot
and benchmarks a fixed set of scikit-learn models on the chosen target.
"""

import tempfile
from io import StringIO

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import (
    classification_report,
    f1_score,
    mean_squared_error,
    precision_score,
    r2_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR

# Upper bound on how long a remote CSV download may block a request.
REQUEST_TIMEOUT_SECONDS = 30
# Rows whose absolute z-score reaches this value in any numeric column are
# treated as outliers.
Z_SCORE_THRESHOLD = 3
# Task names offered by the UI radio button.
TASKS = ("classification", "regression")


# Helper functions
def load_data(file=None, url=None):
    """Read a CSV into a DataFrame from a URL or an uploaded file.

    A non-blank ``url`` takes precedence over ``file``.

    Args:
        file: Uploaded file. Either an object exposing the path as ``.name``
            or a plain path string.
        url: Address of a UTF-8 encoded CSV document.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If neither a URL nor a file was supplied.
        requests.RequestException: If the download fails, times out or the
            server answers with an HTTP error status.
    """
    url = (url or "").strip()
    if url:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        # Without this an HTML error page would be parsed as if it were CSV.
        response.raise_for_status()
        return pd.read_csv(StringIO(response.content.decode("utf-8")))
    if file is None:
        raise ValueError("Provide either a CSV file or a URL to a CSV.")
    # Accept both file-like wrappers (path stored in .name) and bare paths.
    return pd.read_csv(getattr(file, "name", file))


def basic_eda(df: pd.DataFrame) -> dict:
    """Summarize a DataFrame: shape, columns, null counts, dtypes, describe()."""
    info = {
        "Shape": df.shape,
        "Columns": df.columns.tolist(),
        "Missing Values": df.isnull().sum().to_dict(),
        "Data Types": df.dtypes.astype(str).to_dict(),
        "Description": df.describe(include='all').to_dict(),
    }
    return info


def impute_missing(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values filled in.

    Numeric columns are filled with the column mean and all other columns
    with their most frequent value. Columns that contain no values at all
    are dropped: ``SimpleImputer`` discards them, which would otherwise make
    the imputed array narrower than the column list it is assigned to.

    Args:
        df: Input frame. It is not modified.

    Returns:
        A new DataFrame without missing values.
    """
    df = df.dropna(axis=1, how="all").copy()
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    cat_cols = df.select_dtypes(exclude=np.number).columns.tolist()
    for cols, strategy in ((num_cols, 'mean'), (cat_cols, 'most_frequent')):
        if not cols:
            continue
        imputed = SimpleImputer(strategy=strategy).fit_transform(df[cols])
        df[cols] = pd.DataFrame(imputed, columns=cols, index=df.index)
    return df


def detect_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` that are not z-score outliers.

    A row is kept when the absolute z-score of every numeric column is below
    ``Z_SCORE_THRESHOLD``. Undefined z-scores (a constant column has zero
    standard deviation) never mark a row as an outlier.
    """
    numeric_df = df.select_dtypes(include=np.number)
    z_scores = (numeric_df - numeric_df.mean()) / numeric_df.std()
    # abs() catches outliers on both tails; isna() keeps rows for columns
    # whose z-score cannot be computed.
    within_bounds = z_scores.abs().lt(Z_SCORE_THRESHOLD) | z_scores.isna()
    return df[within_bounds.all(axis=1)]


def _classification_models():
    """Build fresh, unfitted classifiers for one benchmarking run."""
    return [
        RandomForestClassifier(),
        LogisticRegression(max_iter=1000),
        GradientBoostingClassifier(),
        KNeighborsClassifier(),
        SVC(),
    ]


def _regression_models():
    """Build fresh, unfitted regressors for one benchmarking run."""
    return [
        RandomForestRegressor(),
        LinearRegression(),
        GradientBoostingRegressor(),
        KNeighborsRegressor(),
        Ridge(),
    ]


def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R², or NaN when there are no residual degrees of freedom."""
    dof = n_samples - n_features - 1
    if dof <= 0:
        return float("nan")
    return 1 - (1 - r2) * ((n_samples - 1) / dof)


def _classification_row(model, splits):
    """Compute weighted precision/recall/F1 of a fitted classifier per split."""
    row = {"Model": model.__class__.__name__}
    for split_name, features, labels in splits:
        predicted = model.predict(features)
        scoring = {"average": 'weighted', "zero_division": 0}
        row[f"{split_name} Precision"] = precision_score(labels, predicted, **scoring)
        row[f"{split_name} Recall"] = recall_score(labels, predicted, **scoring)
        row[f"{split_name} F1-Score"] = f1_score(labels, predicted, **scoring)
    return row


def _regression_row(model, splits):
    """Compute R², adjusted R² and RMSE of a fitted regressor per split."""
    row = {"Model": model.__class__.__name__}
    for split_name, features, values in splits:
        predicted = model.predict(features)
        r2 = r2_score(values, predicted)
        adjusted = _adjusted_r2(r2, len(values), features.shape[1])
        rmse = np.sqrt(mean_squared_error(values, predicted))
        row[f"{split_name} R2"] = round(r2, 4)
        row[f"{split_name} Adjusted R2"] = round(adjusted, 4)
        row[f"{split_name} RMSE"] = round(rmse, 4)
    return row


def train_models(df: pd.DataFrame, target: str, task: str) -> pd.DataFrame:
    """Fit a fixed set of models and tabulate train/test metrics.

    Features are every column except ``target``, one-hot encoded with
    ``pd.get_dummies``; 20% of the rows are held out (``random_state=42``).

    Args:
        df: Dataset without missing values.
        target: Name of the column to predict.
        task: ``'classification'`` selects the classifiers; any other value
            selects the regressors.

    Returns:
        One row per model. Classification reports weighted precision, recall
        and F1; regression reports R², adjusted R² and RMSE. Adjusted R² is
        NaN when a split has too few rows for the number of features.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    X = pd.get_dummies(df.drop(columns=[target]))
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    splits = (("Train", X_train, y_train), ("Test", X_test, y_test))

    if task == 'classification':
        models, build_row = _classification_models(), _classification_row
    else:
        models, build_row = _regression_models(), _regression_row

    results_table = []
    for model in models:
        model.fit(X_train, y_train)
        results_table.append(build_row(model, splits))
    return pd.DataFrame(results_table)


def visualize(df: pd.DataFrame, x_col: str, y_col: str) -> str:
    """Plot one or two features and return the path of the saved PNG.

    Draws a scatter plot when ``y_col`` is truthy, otherwise a histogram of
    ``x_col`` with a KDE overlay. Each call writes to its own temporary file
    so that concurrent requests cannot overwrite each other's image.

    Raises:
        KeyError or ValueError: If a requested column does not exist
            (raised by pandas / seaborn).
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        if y_col:
            sns.scatterplot(data=df, x=x_col, y=y_col, ax=ax)
        else:
            sns.histplot(df[x_col], kde=True, ax=ax)
        fig.tight_layout()
        # Reserve a unique name and close the handle before saving, so the
        # write also works on platforms that lock open files.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
            plot_path = handle.name
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even when plotting fails.
        plt.close(fig)
    return plot_path


def _resolve_column(df, name, label, *, required=True):
    """Map user-typed text to a column of ``df``.

    Tries the text as typed, then with surrounding whitespace removed.
    Returns ``None`` for a blank optional field.

    Raises:
        ValueError: If a required field is blank or no column matches.
    """
    raw = name or ""
    if raw in df.columns:
        return raw
    stripped = raw.strip()
    if stripped in df.columns:
        return stripped
    if not stripped:
        if required:
            raise ValueError(f"{label} is required.")
        return None
    raise ValueError(f"{label} {stripped!r} is not a column of the dataset.")


# Gradio UI
def process(file, url, task, target, x_feature, y_feature):
    """Run the full pipeline for one UI request.

    Validates the inputs, then loads, summarizes, imputes, filters, plots and
    benchmarks the dataset.

    Returns:
        ``(eda, plot_path, results_df)`` matching the interface outputs.

    Raises:
        gr.Error: For invalid input or a failed download, so the UI shows
            the message instead of a generic failure.
    """
    try:
        if task not in TASKS:
            raise ValueError("Select a task type: classification or regression.")
        df = load_data(file, url)
        target = _resolve_column(df, target, "Target column")
        x_feature = _resolve_column(df, x_feature, "X-axis feature")
        y_feature = _resolve_column(df, y_feature, "Y-axis feature", required=False)

        eda = basic_eda(df)
        df = impute_missing(df)
        df = detect_outliers(df)
        plot_path = visualize(df, x_feature, y_feature)
        results_df = train_models(df, target, task)
    except (ValueError, KeyError, requests.RequestException) as err:
        raise gr.Error(str(err)) from err
    return eda, plot_path, results_df


demo = gr.Interface(
    fn=process,
    inputs=[
        gr.File(label="Upload CSV File", file_types=['.csv']),
        gr.Textbox(label="Or enter URL to CSV", placeholder="https://...", lines=1),
        gr.Radio(["classification", "regression"], label="Select Task Type"),
        gr.Textbox(label="Target Column Name"),
        gr.Textbox(label="Feature for X-Axis (for visualization)"),
        gr.Textbox(label="Feature for Y-Axis (optional, for scatter plot)"),
    ],
    outputs=[
        gr.JSON(label="Basic EDA"),
        gr.Image(type="filepath", label="Feature Plot"),
        gr.Dataframe(label="Model Performance"),
    ],
    title="AutoML Dashboard",
    description="Upload a dataset or provide a URL. Select task type, enter target column, choose features to visualize, and evaluate models.",
)

if __name__ == "__main__":
    demo.launch()