Spaces:

Jurk06
/

Auto-ML

Sleeping

App Files Files Community

Jurk06 committed on Apr 10

Commit

07b52c3

verified ·

1 Parent(s): 53cb357

Create app.py

Browse files

Files changed (1) hide show

app.py +155 -0

app.py ADDED Viewed

	@@ -0,0 +1,155 @@

+import gradio as gr
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
+from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
+from sklearn.svm import SVC, SVR
+from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+from sklearn.metrics import classification_report, mean_squared_error, r2_score, precision_score, recall_score, f1_score
+from io import StringIO
+import requests
+# Helper functions
def load_data(file=None, url=None, timeout=30):
    """Load a CSV dataset from a URL or from an uploaded file.

    Args:
        file: Uploaded file. Either a filesystem path string or an object
            exposing the path as ``.name`` (both forms are produced by
            Gradio, depending on version). Ignored when ``url`` is given.
        url: HTTP(S) address of a CSV file. Takes precedence over ``file``
            when non-empty.
        timeout: Seconds to wait for the remote server before giving up.

    Returns:
        pandas.DataFrame parsed from the CSV content.

    Raises:
        ValueError: If neither ``file`` nor ``url`` is provided.
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    if url:
        # NOTE: the URL is user-supplied and fetched server-side; restrict
        # reachable hosts at deployment level if that is a concern.
        # A timeout prevents a stalled server from hanging the request forever.
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an HTML error page as CSV.
        response.raise_for_status()
        content = response.content.decode('utf-8')
        return pd.read_csv(StringIO(content))

    if file is None:
        raise ValueError("Provide either an uploaded CSV file or a URL.")

    # Accept both a plain path string and a file wrapper with a `.name` path.
    path = file if isinstance(file, str) else file.name
    return pd.read_csv(path)
def basic_eda(df):
    """Build a dictionary of basic exploratory statistics for ``df``.

    Returns:
        dict with the keys "Shape" (the ``df.shape`` tuple), "Columns" (list
        of column labels), "Missing Values" (null count per column),
        "Data Types" (dtype name per column) and "Description" (the output
        of ``df.describe(include='all')`` as nested dictionaries).
    """
    null_counts = df.isna().sum()
    dtype_names = df.dtypes.astype(str)
    summary_stats = df.describe(include='all')

    report = {}
    report["Shape"] = df.shape
    report["Columns"] = df.columns.tolist()
    report["Missing Values"] = null_counts.to_dict()
    report["Data Types"] = dtype_names.to_dict()
    report["Description"] = summary_stats.to_dict()
    return report
def impute_missing(df):
    """Return a copy of ``df`` with missing values filled in.

    Numeric columns are converted to float and filled with the column mean;
    all other columns are filled with their most frequent value (the
    smallest one on ties). The input DataFrame is left untouched.

    A column that contains no observed values at all cannot be imputed and
    is returned unchanged instead of raising.

    Args:
        df: DataFrame to impute.

    Returns:
        A new pandas.DataFrame with the same columns and index as ``df``.
    """
    # Work on a copy so the caller's DataFrame is not silently modified.
    result = df.copy()
    num_cols = result.select_dtypes(include=np.number).columns.tolist()
    cat_cols = result.select_dtypes(exclude=np.number).columns.tolist()

    if num_cols:
        # Cast to float first: a mean is generally fractional, and this also
        # covers integer columns uniformly.
        numeric = result[num_cols].astype(float)
        # The mean of an all-NaN column is NaN, so such columns stay as-is.
        result[num_cols] = numeric.fillna(numeric.mean())

    for col in cat_cols:
        # mode() returns its values sorted, so iloc[0] is a stable choice.
        modes = result[col].mode(dropna=True)
        if not modes.empty:
            result[col] = result[col].fillna(modes.iloc[0])

    return result
def detect_outliers(df, threshold=3.0):
    """Return ``df`` without the rows that contain a numeric outlier.

    A value is an outlier when its z-score (distance from the column mean in
    units of the column's sample standard deviation) has an absolute value of
    ``threshold`` or more. Despite the name, the function filters the rows
    out rather than merely flagging them.

    Values whose z-score is undefined (constant columns, a single row, or
    missing data) are not treated as outliers.

    Args:
        df: DataFrame to filter; non-numeric columns are ignored for the test
            but kept in the output.
        threshold: Absolute z-score at or above which a value is an outlier.

    Returns:
        The subset of rows of ``df`` with no outlying numeric value.
    """
    numeric_df = df.select_dtypes(include=np.number)
    # A zero standard deviation would produce NaN/inf z-scores; mark it as
    # undefined explicitly so the column cannot reject any row.
    std = numeric_df.std().replace(0, np.nan)
    z_scores = (numeric_df - numeric_df.mean()) / std
    # Use the magnitude so that extreme low values are caught as well, and
    # keep cells whose z-score is undefined (NaN compares False otherwise).
    acceptable = (z_scores.abs() < threshold) | z_scores.isna()
    return df[acceptable.all(axis=1)]
def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R², or NaN when ``n_samples - n_features - 1 <= 0``."""
    dof = n_samples - n_features - 1
    if dof <= 0:
        # Too few samples for the number of features: the statistic is
        # undefined (and the naive formula would divide by zero).
        return float("nan")
    return 1 - (1 - r2) * ((n_samples - 1) / dof)


def _classification_row(model, X_train, X_test, y_train, y_test):
    """Score a fitted classifier on both splits and return one result row."""
    row = {"Model": model.__class__.__name__}
    splits = (
        ("Train", y_train, model.predict(X_train)),
        ("Test", y_test, model.predict(X_test)),
    )
    metrics = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )
    for split_name, y_true, y_pred in splits:
        for metric_name, metric in metrics:
            score = metric(y_true, y_pred, average='weighted', zero_division=0)
            row[f"{split_name} {metric_name}"] = round(score, 4)
    return row


def _regression_row(model, X_train, X_test, y_train, y_test):
    """Score a fitted regressor on both splits and return one result row."""
    row = {"Model": model.__class__.__name__}
    splits = (
        ("Train", X_train, y_train),
        ("Test", X_test, y_test),
    )
    for split_name, X_part, y_true in splits:
        y_pred = model.predict(X_part)
        r2 = r2_score(y_true, y_pred)
        adj_r2 = _adjusted_r2(r2, len(y_true), X_part.shape[1])
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        row[f"{split_name} R2"] = round(r2, 4)
        row[f"{split_name} Adjusted R2"] = round(adj_r2, 4)
        row[f"{split_name} RMSE"] = round(rmse, 4)
    return row


def train_models(df, target, task):
    """Fit a fixed set of models and tabulate train/test metrics.

    Features are every column except ``target``, one-hot encoded with
    ``pd.get_dummies``; 20% of the rows are held out for testing with a
    fixed random seed for the split.

    Args:
        df: Dataset containing the features and the target column.
        target: Name of the column to predict.
        task: ``'classification'`` selects the classifiers and weighted
            precision/recall/F1; any other value selects the regressors and
            R², adjusted R² and RMSE.

    Returns:
        pandas.DataFrame with one row per model. All metrics are rounded to
        four decimals. Adjusted R² is NaN when a split has too few rows
        relative to the number of encoded features.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in the dataset")

    X = pd.get_dummies(df.drop(columns=[target]))
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    if task == 'classification':
        models = [
            RandomForestClassifier(),
            LogisticRegression(max_iter=1000),
            GradientBoostingClassifier(),
            KNeighborsClassifier(),
            SVC(),
        ]
        build_row = _classification_row
    else:
        models = [
            RandomForestRegressor(),
            LinearRegression(),
            GradientBoostingRegressor(),
            KNeighborsRegressor(),
            Ridge(),
        ]
        build_row = _regression_row

    results_table = []
    for model in models:
        model.fit(X_train, y_train)
        results_table.append(build_row(model, X_train, X_test, y_train, y_test))
    return pd.DataFrame(results_table)
def visualize(df, x_col, y_col):
    """Render a plot of one or two columns and return the image path.

    Draws a scatter plot of ``x_col`` against ``y_col`` when ``y_col`` is
    truthy, otherwise a histogram with a KDE overlay of ``x_col``.

    Args:
        df: Data to plot.
        x_col: Column shown on the x-axis.
        y_col: Column shown on the y-axis; empty/None selects the histogram.

    Returns:
        Path of the PNG file that was written. Each call writes to its own
        temporary file so simultaneous requests cannot overwrite each
        other's image.
    """
    import tempfile

    # Use an explicit figure/axes pair rather than pyplot's global "current
    # figure", and always release the figure even if plotting fails.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        if y_col:
            sns.scatterplot(data=df, x=x_col, y=y_col, ax=ax)
        else:
            sns.histplot(df[x_col], kde=True, ax=ax)
        fig.tight_layout()
        # Reserve a unique file name; the handle is closed before saving so
        # the path can be reopened on every platform.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
            plot_path = tmp.name
        fig.savefig(plot_path)
    finally:
        plt.close(fig)
    return plot_path
+# Gradio UI
def _resolve_column(df, name, label, required=True):
    """Map a user-typed column name onto an actual column of ``df``.

    The text is tried verbatim first (column labels may legitimately contain
    surrounding spaces) and then with surrounding whitespace removed.
    Returns "" for an empty optional field; raises ``gr.Error`` otherwise
    when no column matches.
    """
    raw = name or ""
    if raw in df.columns:
        return raw
    stripped = raw.strip()
    if stripped in df.columns:
        return stripped
    if not stripped:
        if required:
            raise gr.Error(f"Please enter the {label}.")
        return ""
    raise gr.Error(f"{label} '{stripped}' is not a column of the dataset.")


def process(file, url, task, target, x_feature, y_feature):
    """Run the full pipeline for one Gradio request.

    Loads the dataset, summarises it, imputes missing values, removes
    outlier rows, draws the requested plot and evaluates the models.

    Args:
        file: Uploaded CSV file, or None.
        url: URL of a CSV file; used instead of ``file`` when non-blank.
        task: Task type forwarded to ``train_models``.
        target: Name of the column to predict.
        x_feature: Column to plot on the x-axis.
        y_feature: Optional column to plot on the y-axis.

    Returns:
        Tuple ``(eda, plot_path, results_df)``: the EDA dictionary computed
        on the raw data, the path of the rendered plot, and the model
        metrics table.

    Raises:
        gr.Error: With a user-readable message when no data source is given,
            the data cannot be loaded, a column name is missing or unknown,
            or no rows remain after outlier removal.
    """
    url = (url or "").strip()
    if file is None and not url:
        raise gr.Error("Please upload a CSV file or enter a URL to one.")

    try:
        df = load_data(file, url or None)
    except Exception as exc:
        # Boundary with the UI: surface the cause instead of a bare traceback.
        raise gr.Error(f"Could not load the dataset: {exc}") from exc

    # Validate the column names before doing any expensive work.
    target = _resolve_column(df, target, "Target Column Name")
    x_feature = _resolve_column(df, x_feature, "Feature for X-Axis")
    y_feature = _resolve_column(df, y_feature, "Feature for Y-Axis", required=False)

    # The EDA is computed on the raw data so missing values are reported.
    eda = basic_eda(df)
    df = impute_missing(df)
    df = detect_outliers(df)
    if df.empty:
        raise gr.Error("No rows are left after outlier removal.")

    plot_path = visualize(df, x_feature, y_feature)
    results_df = train_models(df, target, task)
    return eda, plot_path, results_df
# Gradio interface wiring `process` to the input widgets and result views.
# Input order must match the parameter order of `process`; output order must
# match its returned tuple.
demo = gr.Interface(
    fn=process,
    inputs=[
        # Components are optional by default; the old `optional=True` keyword
        # is not accepted by current Gradio releases and is therefore omitted.
        gr.File(label="Upload CSV File", file_types=['.csv']),
        gr.Textbox(label="Or enter URL to CSV", placeholder="https://...", lines=1),
        gr.Radio(["classification", "regression"], label="Select Task Type"),
        gr.Textbox(label="Target Column Name"),
        gr.Textbox(label="Feature for X-Axis (for visualization)"),
        gr.Textbox(label="Feature for Y-Axis (optional, for scatter plot)"),
    ],
    outputs=[
        gr.JSON(label="Basic EDA"),
        gr.Image(type="filepath", label="Feature Plot"),
        gr.Dataframe(label="Model Performance"),
    ],
    title="AutoML Dashboard",
    description="Upload a dataset or provide a URL. Select task type, enter target column, choose features to visualize, and evaluate models."
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()