Jurk06 committed on
Commit
07b52c3
·
verified ·
1 Parent(s): 53cb357

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -0
app.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import seaborn as sns
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.impute import SimpleImputer
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
9
+ from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
10
+ from sklearn.svm import SVC, SVR
11
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
12
+ from sklearn.metrics import classification_report, mean_squared_error, r2_score, precision_score, recall_score, f1_score
13
+ from io import StringIO
14
+ import requests
15
+
16
+ # Helper functions
17
def load_data(file=None, url=None):
    """Load a CSV dataset from a URL or from an uploaded file.

    Args:
        file: The uploaded CSV. Either a filesystem path (``str`` or
            path-like) or an object exposing the path as ``.name``.
        url: Optional HTTP(S) address of a CSV file. When non-empty it takes
            precedence over ``file``.

    Returns:
        pandas.DataFrame parsed from the CSV content.

    Raises:
        ValueError: If neither a URL nor a file is supplied.
        requests.HTTPError: If the URL answers with an error status.
    """
    # A blank or whitespace-only textbox value means "no URL".
    if isinstance(url, str):
        url = url.strip()

    if url:
        # Bound the wait and fail loudly on 4xx/5xx instead of trying to
        # parse an HTML error page as CSV.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return pd.read_csv(StringIO(response.content.decode('utf-8')))

    if file is None:
        raise ValueError("Provide either a CSV file or a URL to a CSV file.")

    # Accept both a plain path and a file wrapper carrying the path in .name.
    # Path-like objects are checked first because pathlib.Path also has a
    # .name attribute, which is only the basename.
    if isinstance(file, str) or hasattr(file, "__fspath__"):
        path = file
    else:
        path = file.name
    return pd.read_csv(path)
24
+
25
def basic_eda(df):
    """Summarise a DataFrame as a JSON-friendly dictionary.

    Args:
        df: The pandas.DataFrame to describe.

    Returns:
        dict with the keys ``"Shape"``, ``"Columns"``, ``"Missing Values"``,
        ``"Data Types"`` and ``"Description"``. Statistics that do not apply
        to a column (e.g. ``mean`` of a text column) are reported as ``None``
        rather than NaN so the result can be serialised as strict JSON.
    """
    def _json_safe(value):
        # Unwrap NumPy scalars and turn NaN/NaT into None.
        if isinstance(value, np.generic):
            value = value.item()
        return None if pd.isna(value) else value

    # describe() raises on a frame without columns; report an empty summary.
    if len(df.columns):
        raw_description = df.describe(include='all').to_dict()
    else:
        raw_description = {}

    description = {
        column: {stat: _json_safe(value) for stat, value in stats.items()}
        for column, stats in raw_description.items()
    }

    return {
        "Shape": df.shape,
        "Columns": df.columns.tolist(),
        "Missing Values": df.isnull().sum().to_dict(),
        "Data Types": df.dtypes.astype(str).to_dict(),
        "Description": description,
    }
34
+
35
def impute_missing(df):
    """Fill missing values: column mean for numeric data, mode otherwise.

    The input frame is left untouched; a filled copy is returned. Columns
    that contain no observed value at all cannot be imputed and are returned
    unchanged instead of raising.

    Args:
        df: The pandas.DataFrame to impute.

    Returns:
        A new pandas.DataFrame with the same columns and index as ``df``.
    """
    result = df.copy()
    num_cols = result.select_dtypes(include=np.number).columns
    cat_cols = result.select_dtypes(exclude=np.number).columns

    if len(num_cols):
        # The mean of an all-NaN column is NaN, so such columns stay as-is.
        result[num_cols] = result[num_cols].fillna(result[num_cols].mean())

    for col in cat_cols:
        modes = result[col].mode(dropna=True)
        # mode() is sorted, so ties resolve to the smallest value.
        if not modes.empty:
            result[col] = result[col].fillna(modes.iloc[0])

    return result
47
+
48
def detect_outliers(df, threshold=3):
    """Return the rows of ``df`` that are not z-score outliers.

    A row is kept when, for every numeric column, the absolute z-score of its
    value is below ``threshold``. Non-numeric columns are ignored for the
    test but preserved in the result.

    Args:
        df: The pandas.DataFrame to filter.
        threshold: Maximum absolute z-score a value may have (default 3).

    Returns:
        The subset of ``df`` whose rows pass the test.
    """
    numeric_df = df.select_dtypes(include=np.number)
    if numeric_df.empty:
        # No numeric data (or no rows): nothing can be flagged.
        return df

    z_scores = (numeric_df - numeric_df.mean()) / numeric_df.std()
    # abs() so that extreme low values are removed as well as extreme highs.
    # A NaN z-score (zero-variance column, single row, or missing value)
    # carries no outlier evidence, so it must not disqualify the row.
    within_bounds = (z_scores.abs() < threshold) | z_scores.isna()
    return df[within_bounds.all(axis=1)]
52
+
53
def _adjusted_r2(r2, n_samples, n_features):
    """Return adjusted R², or NaN when n_samples - n_features - 1 <= 0."""
    dof = n_samples - n_features - 1
    if dof <= 0:
        # Too few samples for the number of features: the statistic is
        # undefined (and the plain formula would divide by zero).
        return float("nan")
    return 1 - (1 - r2) * ((n_samples - 1) / dof)


def _classification_row(model, X_train, X_test, y_train, y_test):
    """Build one results row of weighted precision/recall/F1 per split."""
    row = {"Model": model.__class__.__name__}
    for split, features, y_true in (("Train", X_train, y_train),
                                    ("Test", X_test, y_test)):
        y_pred = model.predict(features)
        row[f"{split} Precision"] = precision_score(
            y_true, y_pred, average='weighted', zero_division=0)
        row[f"{split} Recall"] = recall_score(
            y_true, y_pred, average='weighted', zero_division=0)
        row[f"{split} F1-Score"] = f1_score(
            y_true, y_pred, average='weighted', zero_division=0)
    return row


def _regression_row(model, X_train, X_test, y_train, y_test):
    """Build one results row of R², adjusted R² and RMSE per split."""
    row = {"Model": model.__class__.__name__}
    for split, features, y_true in (("Train", X_train, y_train),
                                    ("Test", X_test, y_test)):
        y_pred = model.predict(features)
        r2 = r2_score(y_true, y_pred)
        adj_r2 = _adjusted_r2(r2, len(y_true), features.shape[1])
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        row[f"{split} R2"] = round(r2, 4)
        row[f"{split} Adjusted R2"] = round(adj_r2, 4)
        row[f"{split} RMSE"] = round(rmse, 4)
    return row


def train_models(df, target, task):
    """Fit a fixed set of models and tabulate train/test metrics.

    Features are every column except ``target``, one-hot encoded with
    ``pd.get_dummies``; 20% of the rows are held out for testing.

    Args:
        df: Dataset containing the features and the target column.
        target: Name of the column to predict.
        task: ``'classification'`` selects the classifiers; any other value
            selects the regressors.

    Returns:
        pandas.DataFrame with one row per model. Classification rows hold
        weighted precision/recall/F1 for train and test; regression rows hold
        R², adjusted R² and RMSE (rounded to 4 decimals). Adjusted R² is NaN
        when a split has too few rows for the number of features.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target column {target!r} not found in the dataset.")

    X = pd.get_dummies(df.drop(columns=[target]))
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42)

    # The ensembles are seeded like the split so repeated runs on the same
    # data report the same numbers.
    if task == 'classification':
        models = [
            RandomForestClassifier(random_state=42),
            LogisticRegression(max_iter=1000),
            GradientBoostingClassifier(random_state=42),
            KNeighborsClassifier(),
            SVC(),
        ]
        build_row = _classification_row
    else:
        models = [
            RandomForestRegressor(random_state=42),
            LinearRegression(),
            GradientBoostingRegressor(random_state=42),
            KNeighborsRegressor(),
            Ridge(),
        ]
        build_row = _regression_row

    results_table = []
    for model in models:
        model.fit(X_train, y_train)
        results_table.append(
            build_row(model, X_train, X_test, y_train, y_test))

    return pd.DataFrame(results_table)
113
+
114
def visualize(df, x_col, y_col):
    """Render a feature plot to a PNG file and return its path.

    Draws a scatter plot of ``x_col`` against ``y_col`` when ``y_col`` is
    truthy, otherwise a histogram (with KDE) of ``x_col``.

    Args:
        df: Dataset to plot from.
        x_col: Column shown on the x-axis.
        y_col: Column shown on the y-axis, or a falsy value for a histogram.

    Returns:
        str path of the written PNG. Each call writes a fresh temporary file,
        so concurrent requests do not overwrite each other's plot. The file
        is not deleted here.
    """
    import tempfile

    # Use an explicit figure/axes pair rather than pyplot's implicit
    # "current figure", and always release the figure, even on error.
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        if y_col:
            sns.scatterplot(data=df, x=x_col, y=y_col, ax=ax)
        else:
            sns.histplot(df[x_col], kde=True, ax=ax)
        fig.tight_layout()

        # Reserve a unique name, close the handle, then let matplotlib write.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
            plot_path = handle.name
        fig.savefig(plot_path)
    finally:
        plt.close(fig)
    return plot_path
124
+
125
+ # Gradio UI
126
def process(file, url, task, target, x_feature, y_feature):
    """Run the full pipeline behind the Gradio interface.

    Loads the dataset, summarises it, imputes missing values, removes
    outliers, renders the requested plot and evaluates the models.

    Args:
        file: Uploaded CSV file, or ``None``.
        url: URL of a CSV file; may be empty.
        task: ``'classification'`` or ``'regression'``.
        target: Name of the column to predict.
        x_feature: Column plotted on the x-axis.
        y_feature: Optional column for the y-axis; empty means histogram.

    Returns:
        Tuple ``(eda, plot_path, results_df)``: the EDA dictionary computed
        on the raw data, the path of the plot image, and the model
        performance table.

    Raises:
        ValueError: If no data source is given, or if a required column name
            is empty or absent from the dataset.
    """
    # Textbox values arrive as raw strings (possibly None); normalise them so
    # stray whitespace does not turn into a column-lookup failure.
    url = (url or "").strip()
    target = (target or "").strip()
    x_feature = (x_feature or "").strip()
    y_feature = (y_feature or "").strip() or None

    if not url and file is None:
        raise ValueError("Provide either a CSV file or a URL to a CSV file.")

    df = load_data(file, url)

    # Fail early with a readable message before plotting or training.
    required = (("Target column", target), ("X-axis feature", x_feature))
    for label, column in required:
        if not column:
            raise ValueError(f"{label} is required.")
        if column not in df.columns:
            raise ValueError(f"{label} {column!r} not found in the dataset.")
    if y_feature is not None and y_feature not in df.columns:
        raise ValueError(f"Y-axis feature {y_feature!r} not found in the dataset.")

    eda = basic_eda(df)
    df = impute_missing(df)
    df = detect_outliers(df)
    plot_path = visualize(df, x_feature, y_feature)
    results_df = train_models(df, target, task)
    return eda, plot_path, results_df
134
+
135
# Interface wiring: six inputs map positionally onto process()'s parameters,
# three outputs onto its returned tuple.
# NOTE: the former `optional=True` keyword was dropped from the File and
# Textbox inputs; it is deprecated in Gradio 3.x and not accepted by newer
# component constructors. Empty inputs are handled inside process().
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.File(label="Upload CSV File", file_types=['.csv']),
        gr.Textbox(label="Or enter URL to CSV", placeholder="https://...", lines=1),
        gr.Radio(["classification", "regression"], label="Select Task Type"),
        gr.Textbox(label="Target Column Name"),
        gr.Textbox(label="Feature for X-Axis (for visualization)"),
        gr.Textbox(label="Feature for Y-Axis (optional, for scatter plot)"),
    ],
    outputs=[
        gr.JSON(label="Basic EDA"),
        gr.Image(type="filepath", label="Feature Plot"),
        gr.Dataframe(label="Model Performance"),
    ],
    title="AutoML Dashboard",
    description="Upload a dataset or provide a URL. Select task type, enter target column, choose features to visualize, and evaluate models."
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()