Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import seaborn as sns
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
from sklearn.impute import SimpleImputer
|
7 |
+
from sklearn.model_selection import train_test_split
|
8 |
+
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
|
9 |
+
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
|
10 |
+
from sklearn.svm import SVC, SVR
|
11 |
+
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
|
12 |
+
from sklearn.metrics import classification_report, mean_squared_error, r2_score, precision_score, recall_score, f1_score
|
13 |
+
from io import StringIO
|
14 |
+
import requests
|
15 |
+
|
16 |
+
# Helper functions
|
17 |
+
def load_data(file=None, url=None):
    """Load a CSV dataset from a URL or from an uploaded file.

    Args:
        file: Uploaded CSV. Either a filesystem path (``str`` or any
            ``os.PathLike``) or an object that exposes the path as ``.name``.
            Only consulted when ``url`` is empty.
        url: Address of a CSV document. Takes precedence over ``file`` when
            it is a non-blank string.

    Returns:
        pandas.DataFrame: The parsed CSV content.

    Raises:
        ValueError: If neither ``url`` nor ``file`` is supplied.
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """
    if isinstance(url, str):
        url = url.strip()

    if url:
        # NOTE(security): the URL is user-supplied and fetched as-is; restrict
        # the allowed hosts if this app is exposed beyond trusted users.
        # A timeout keeps a stalled server from hanging the request forever.
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of parsing an error page as CSV.
        response.raise_for_status()
        return pd.read_csv(StringIO(response.content.decode('utf-8')))

    if file is None:
        raise ValueError("Provide either a CSV file or a URL to a CSV file.")

    # Uploads may arrive as a plain path or as a file wrapper carrying the
    # path in ``.name``; path-like objects are passed through unchanged.
    is_path = isinstance(file, str) or hasattr(file, "__fspath__")
    return pd.read_csv(file if is_path else file.name)
|
24 |
+
|
25 |
+
def basic_eda(df):
    """Build a summary of a DataFrame for exploratory inspection.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        dict: Five entries, in this order: ``"Shape"`` (the ``df.shape``
        tuple), ``"Columns"`` (list of column labels), ``"Missing Values"``
        (null count per column), ``"Data Types"`` (dtype name per column)
        and ``"Description"`` (``df.describe(include='all')`` as a nested
        dict).
    """
    null_counts = df.isnull().sum()
    dtype_names = df.dtypes.astype(str)
    statistics = df.describe(include='all')

    summary = {}
    summary["Shape"] = df.shape
    summary["Columns"] = df.columns.tolist()
    summary["Missing Values"] = null_counts.to_dict()
    summary["Data Types"] = dtype_names.to_dict()
    summary["Description"] = statistics.to_dict()
    return summary
|
34 |
+
|
35 |
+
def impute_missing(df):
    """Return a copy of ``df`` with missing values filled column by column.

    Numeric columns (those selected by ``select_dtypes(include=np.number)``)
    are filled with the column mean. Every other column is filled with its
    most frequent value; on ties the first entry of ``Series.mode`` is used.

    Columns without any missing value are left exactly as they are, so
    integer columns keep their dtype. Columns with no observed value at all
    have nothing to impute from and are returned still empty instead of
    raising.

    Args:
        df: Input pandas DataFrame. It is not modified.

    Returns:
        pandas.DataFrame: A new frame with the same columns and index.
    """
    filled = df.copy()
    numeric_columns = set(filled.select_dtypes(include=np.number).columns)

    for column in filled.columns:
        series = filled[column]
        if not series.isna().any():
            continue

        if column in numeric_columns:
            fill_value = series.mean()
        else:
            modes = series.mode(dropna=True)
            fill_value = modes.iloc[0] if not modes.empty else np.nan

        # An all-missing column yields no usable statistic; skip it rather
        # than fail the whole pipeline.
        if pd.isna(fill_value):
            continue

        filled[column] = series.fillna(fill_value)

    return filled
|
47 |
+
|
48 |
+
def detect_outliers(df, threshold=3):
    """Return ``df`` without the rows that hold a z-score outlier.

    A row is dropped when, in any numeric column, its value lies
    ``threshold`` or more standard deviations away from the column mean, in
    either direction. Non-numeric columns are ignored for the decision but
    kept in the result.

    Args:
        df: Input pandas DataFrame.
        threshold: Absolute z-score at or beyond which a value counts as an
            outlier. Defaults to 3.

    Returns:
        pandas.DataFrame: The rows of ``df`` that passed the filter.
    """
    numeric_df = df.select_dtypes(include=np.number)

    # A zero spread would turn the z-score into 0/0; mark it undefined.
    spread = numeric_df.std().replace(0, np.nan)
    z_scores = (numeric_df - numeric_df.mean()) / spread

    # Undefined scores (constant column, single row, missing cell) are no
    # evidence of an outlier, so they must not cause a row to be dropped.
    within_limits = z_scores.abs().lt(threshold) | z_scores.isna()
    return df[within_limits.all(axis=1)]
|
52 |
+
|
53 |
+
def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R², or NaN when there are too few samples to define it."""
    degrees_of_freedom = n_samples - n_features - 1
    if degrees_of_freedom <= 0:
        return float("nan")
    return 1 - (1 - r2) * ((n_samples - 1) / degrees_of_freedom)


def _classification_scores(y_true, y_pred, prefix):
    """Return weighted precision, recall and F1 for one split, keyed by ``prefix``."""
    options = {"average": "weighted", "zero_division": 0}
    return {
        f"{prefix} Precision": round(precision_score(y_true, y_pred, **options), 4),
        f"{prefix} Recall": round(recall_score(y_true, y_pred, **options), 4),
        f"{prefix} F1-Score": round(f1_score(y_true, y_pred, **options), 4),
    }


def _regression_scores(y_true, y_pred, n_features, prefix):
    """Return R², adjusted R² and RMSE for one split, keyed by ``prefix``."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return {
        f"{prefix} R2": round(r2, 4),
        f"{prefix} Adjusted R2": round(_adjusted_r2(r2, len(y_true), n_features), 4),
        f"{prefix} RMSE": round(rmse, 4),
    }


def train_models(df, target, task):
    """Fit a fixed set of baseline models and tabulate train/test metrics.

    The features are every column except ``target``, one-hot encoded with
    ``pd.get_dummies``. The data is split 80/20 with a fixed seed. The
    tree ensembles use the same seed so repeated runs on the same data
    produce the same table.

    Args:
        df: Dataset containing the features and the target column.
        target: Name of the column to predict.
        task: ``'classification'`` selects the classifiers; any other value
            selects the regressors.

    Returns:
        pandas.DataFrame: One row per model. Classification rows hold
        weighted precision, recall and F1 for the train and test splits.
        Regression rows hold R², adjusted R² and RMSE for both splits.
        All metrics are rounded to four decimals. Adjusted R² is NaN when
        a split has too few rows relative to the number of features.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in the dataset.")

    features = pd.get_dummies(df.drop(columns=[target]))
    labels = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    seed = 42
    is_classification = task == 'classification'
    if is_classification:
        models = [
            RandomForestClassifier(random_state=seed),
            LogisticRegression(max_iter=1000),
            GradientBoostingClassifier(random_state=seed),
            KNeighborsClassifier(),
            SVC(),
        ]
    else:
        models = [
            RandomForestRegressor(random_state=seed),
            LinearRegression(),
            GradientBoostingRegressor(random_state=seed),
            KNeighborsRegressor(),
            Ridge(),
        ]

    splits = (("Train", X_train, y_train), ("Test", X_test, y_test))
    results_table = []
    for model in models:
        model.fit(X_train, y_train)
        row = {"Model": model.__class__.__name__}
        for split_name, X_split, y_split in splits:
            predictions = model.predict(X_split)
            if is_classification:
                row.update(_classification_scores(y_split, predictions, split_name))
            else:
                row.update(
                    _regression_scores(y_split, predictions, X_split.shape[1], split_name)
                )
        results_table.append(row)

    return pd.DataFrame(results_table)
|
113 |
+
|
114 |
+
def visualize(df, x_col, y_col):
    """Plot one or two columns of ``df`` and save the figure as ``plot.png``.

    Args:
        df: DataFrame holding the data to plot.
        x_col: Column drawn on the x-axis.
        y_col: Column drawn on the y-axis. When empty, a histogram of
            ``x_col`` with a KDE overlay is drawn instead of a scatter plot.

    Returns:
        str: Path of the written image, always ``"plot.png"``.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        if y_col:
            sns.scatterplot(data=df, x=x_col, y=y_col, ax=ax)
        else:
            sns.histplot(df[x_col], kde=True, ax=ax)
        fig.tight_layout()
        fig.savefig("plot.png")
    finally:
        # Release the figure even when plotting fails (e.g. unknown column),
        # otherwise every failed request leaves an open figure behind.
        plt.close(fig)
    return "plot.png"
|
124 |
+
|
125 |
+
# Gradio UI
|
126 |
+
def _resolve_column(df, name, label, required):
    """Match a user-typed column name against ``df``, tolerating stray whitespace."""
    if name in df.columns:
        return name
    cleaned = (name or "").strip()
    if not cleaned:
        if required:
            raise ValueError(f"{label} is required.")
        return None
    if cleaned not in df.columns:
        raise ValueError(
            f"{label} {cleaned!r} is not a column of the dataset. "
            f"Available columns: {df.columns.tolist()}"
        )
    return cleaned


def process(file, url, task, target, x_feature, y_feature):
    """Run the full pipeline behind the dashboard for one request.

    Loads the dataset, summarises it, imputes missing values, filters
    outliers, draws the requested plot and trains the models. Column names
    are checked against the loaded data before any of that work starts, so
    a typo is reported with a clear message.

    Args:
        file: Uploaded CSV, used when ``url`` is empty.
        url: Address of a CSV document.
        task: ``'classification'`` or ``'regression'``.
        target: Name of the column to predict.
        x_feature: Column for the x-axis of the plot.
        y_feature: Optional column for the y-axis of the plot.

    Returns:
        tuple: ``(eda, plot_path, results_df)`` — the summary computed on
        the raw data, the path of the saved plot, and the table of model
        metrics.

    Raises:
        ValueError: If a required column name is blank or any given column
            name does not exist in the dataset.
    """
    df = load_data(file, url)

    target = _resolve_column(df, target, "Target column", required=True)
    x_feature = _resolve_column(df, x_feature, "X-axis feature", required=True)
    y_feature = _resolve_column(df, y_feature, "Y-axis feature", required=False)

    # The summary describes the data as uploaded, before any cleaning.
    eda = basic_eda(df)
    df = impute_missing(df)
    df = detect_outliers(df)
    plot_path = visualize(df, x_feature, y_feature)
    results_df = train_models(df, target, task)
    return eda, plot_path, results_df
|
134 |
+
|
135 |
+
# Gradio front end. The six inputs are passed positionally to ``process`` in
# the order listed; its three return values fill the three outputs in order.
# Inputs need no ``optional`` flag: the file and URL fields may each be left
# empty, and ``optional=`` is a deprecated argument that newer Gradio
# releases reject.
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.File(label="Upload CSV File", file_types=['.csv']),
        gr.Textbox(label="Or enter URL to CSV", placeholder="https://...", lines=1),
        gr.Radio(["classification", "regression"], label="Select Task Type"),
        gr.Textbox(label="Target Column Name"),
        gr.Textbox(label="Feature for X-Axis (for visualization)"),
        gr.Textbox(label="Feature for Y-Axis (optional, for scatter plot)"),
    ],
    outputs=[
        gr.JSON(label="Basic EDA"),
        gr.Image(type="filepath", label="Feature Plot"),
        gr.Dataframe(label="Model Performance"),
    ],
    title="AutoML Dashboard",
    description="Upload a dataset or provide a URL. Select task type, enter target column, choose features to visualize, and evaluate models.",
)

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()
|