|
import gradio as gr |
|
import pandas as pd |
|
import numpy as np |
|
import seaborn as sns |
|
import matplotlib.pyplot as plt |
|
from sklearn.impute import SimpleImputer |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor |
|
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge |
|
from sklearn.svm import SVC, SVR |
|
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor |
|
from sklearn.metrics import classification_report, mean_squared_error, r2_score, precision_score, recall_score, f1_score |
|
from io import StringIO |
|
import requests |
|
|
|
|
|
def load_data(file=None, url=None):
    """Load a CSV into a DataFrame from an uploaded file or a URL.

    Args:
        file: uploaded file object exposing a ``.name`` path (Gradio File).
        url: URL pointing at a raw CSV; takes precedence when non-empty.

    Returns:
        pd.DataFrame parsed from the CSV.

    Raises:
        ValueError: if neither a file nor a URL was provided.
        requests.HTTPError: if the URL responds with an error status.
    """
    if url:
        # Fail fast on HTTP errors instead of feeding an error page to read_csv.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        df = pd.read_csv(StringIO(response.text))
    elif file is not None:
        df = pd.read_csv(file.name)
    else:
        # Previously file.name raised an opaque AttributeError here.
        raise ValueError("Provide either an uploaded CSV file or a URL to a CSV.")
    return df
|
|
|
def basic_eda(df):
    """Return a quick exploratory summary of *df*.

    The summary covers shape, column names, per-column missing-value
    counts, dtypes (as strings, for JSON display), and describe() stats.
    """
    summary = {}
    summary["Shape"] = df.shape
    summary["Columns"] = list(df.columns)
    summary["Missing Values"] = df.isnull().sum().to_dict()
    summary["Data Types"] = df.dtypes.astype(str).to_dict()
    summary["Description"] = df.describe(include='all').to_dict()
    return summary
|
|
|
def impute_missing(df):
    """Return a copy of *df* with missing values imputed.

    Numeric columns are filled with their column mean; non-numeric columns
    with their most frequent value (mode). The input frame is NOT mutated.

    Notes on the rewrite: the previous SimpleImputer-based version mutated
    the caller's DataFrame in place, and SimpleImputer silently drops
    all-NaN columns, which made the subsequent pd.DataFrame(...) rebuild
    crash with a shape mismatch. Plain pandas fillna avoids both issues
    and also preserves integer dtypes where no NaN is present.
    """
    df = df.copy()  # never mutate the caller's frame
    num_cols = df.select_dtypes(include=np.number).columns
    cat_cols = df.select_dtypes(exclude=np.number).columns

    if len(num_cols):
        df[num_cols] = df[num_cols].fillna(df[num_cols].mean())

    for col in cat_cols:
        if df[col].isnull().any():
            mode = df[col].mode(dropna=True)
            # An all-NaN column has no mode; leave it untouched rather than crash.
            if not mode.empty:
                df[col] = df[col].fillna(mode.iloc[0])

    return df
|
|
|
def detect_outliers(df):
    """Drop rows containing a numeric outlier (|z-score| >= 3).

    Bug fixes vs. the original:
    - The original compared the SIGNED z-score (``z < 3``), so extreme
      negative outliers were never removed; we compare ``abs(z)``.
    - A constant column (std == 0) produced all-NaN z-scores, which made
      the comparison False and silently dropped EVERY row; NaN z-scores
      are now treated as non-outliers. (Callers impute missing values
      first, so a NaN z-score here only means zero variance.)
    """
    numeric = df.select_dtypes(include=np.number)
    z = (numeric - numeric.mean()) / numeric.std()
    z = z.fillna(0)  # zero-variance columns -> no outliers, keep the rows
    return df[(z.abs() < 3).all(axis=1)]
|
|
|
def train_models(df, target, task):
    """Fit a suite of baseline models and return a per-model metrics table.

    Args:
        df: cleaned DataFrame containing the target column.
        target: name of the column to predict.
        task: ``'classification'`` or ``'regression'``; any other value is
            treated as regression (matching the UI's two radio options).

    Returns:
        pd.DataFrame with one row per model and train/test metrics.
    """
    X = pd.get_dummies(df.drop(columns=[target]))  # one-hot encode categoricals
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    if task == 'classification':
        rows = _score_classifiers(X_train, X_test, y_train, y_test)
    else:
        rows = _score_regressors(X_train, X_test, y_train, y_test)

    return pd.DataFrame(rows)


def _score_classifiers(X_train, X_test, y_train, y_test):
    """Fit standard classifiers; report weighted precision/recall/F1."""
    models = [
        RandomForestClassifier(),
        LogisticRegression(max_iter=1000),
        GradientBoostingClassifier(),
        KNeighborsClassifier(),
        SVC(),
    ]
    rows = []
    for model in models:
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        rows.append({
            "Model": model.__class__.__name__,
            "Train Precision": precision_score(y_train, pred_train, average='weighted', zero_division=0),
            "Train Recall": recall_score(y_train, pred_train, average='weighted', zero_division=0),
            "Train F1-Score": f1_score(y_train, pred_train, average='weighted', zero_division=0),
            "Test Precision": precision_score(y_test, pred_test, average='weighted', zero_division=0),
            "Test Recall": recall_score(y_test, pred_test, average='weighted', zero_division=0),
            "Test F1-Score": f1_score(y_test, pred_test, average='weighted', zero_division=0),
        })
    return rows


def _adjusted_r2(r2, n_samples, n_features):
    """Adjusted R^2; NaN when there are too few samples to adjust.

    The original formula raised ZeroDivisionError (or flipped sign)
    whenever n_samples - n_features - 1 <= 0, which is common on small
    datasets after one-hot encoding.
    """
    denom = n_samples - n_features - 1
    if denom <= 0:
        return float('nan')
    return 1 - (1 - r2) * (n_samples - 1) / denom


def _score_regressors(X_train, X_test, y_train, y_test):
    """Fit standard regressors; report R2, adjusted R2 and RMSE."""
    models = [
        RandomForestRegressor(),
        LinearRegression(),
        GradientBoostingRegressor(),
        KNeighborsRegressor(),
        Ridge(),
    ]
    rows = []
    for model in models:
        model.fit(X_train, y_train)
        pred_train = model.predict(X_train)
        pred_test = model.predict(X_test)
        r2_train = r2_score(y_train, pred_train)
        r2_test = r2_score(y_test, pred_test)
        rows.append({
            "Model": model.__class__.__name__,
            "Train R2": round(r2_train, 4),
            "Train Adjusted R2": round(_adjusted_r2(r2_train, len(y_train), X_train.shape[1]), 4),
            "Train RMSE": round(float(np.sqrt(mean_squared_error(y_train, pred_train))), 4),
            "Test R2": round(r2_test, 4),
            "Test Adjusted R2": round(_adjusted_r2(r2_test, len(y_test), X_test.shape[1]), 4),
            "Test RMSE": round(float(np.sqrt(mean_squared_error(y_test, pred_test))), 4),
        })
    return rows
|
|
|
def visualize(df, x_col, y_col, out_path="plot.png"):
    """Save a plot of one or two columns and return the image path.

    Args:
        df: data to plot.
        x_col: column for the x-axis.
        y_col: optional column for the y-axis; when truthy a scatter plot
            is drawn, otherwise a histogram (with KDE) of x_col.
        out_path: where to save the figure. The default preserves the old
            hard-coded "plot.png" for backward compatibility.

    Returns:
        The path the figure was saved to.
    """
    plt.figure(figsize=(8, 6))
    try:
        if y_col:
            sns.scatterplot(data=df, x=x_col, y=y_col)
        else:
            sns.histplot(df[x_col], kde=True)
        plt.tight_layout()
        plt.savefig(out_path)
    finally:
        # Always release the figure so a plotting error (e.g. bad column
        # name) doesn't leak figures across repeated Gradio calls.
        plt.close()
    return out_path
|
|
|
|
|
def process(file, url, task, target, x_feature, y_feature):
    """End-to-end pipeline: load -> EDA -> impute -> outlier filter -> plot -> train.

    Args:
        file/url: data source, forwarded to load_data.
        task: 'classification' or 'regression'.
        target: column to predict.
        x_feature: column for the x-axis of the visualization.
        y_feature: optional column for the y-axis (scatter plot).

    Returns:
        (eda dict, plot image path, metrics DataFrame).

    Raises:
        ValueError: if the target or x-axis column is missing from the
            data, so the user sees a clear message instead of a KeyError
            raised deep inside plotting or training.
    """
    df = load_data(file, url)
    eda = basic_eda(df)
    df = impute_missing(df)
    df = detect_outliers(df)

    # Validate user-typed column names before handing them to plotting/training.
    if target not in df.columns:
        raise ValueError(f"Target column '{target}' not found in dataset.")
    if x_feature not in df.columns:
        raise ValueError(f"X-axis feature '{x_feature}' not found in dataset.")

    plot_path = visualize(df, x_feature, y_feature)
    results_df = train_models(df, target, task)
    return eda, plot_path, results_df
|
|
|
# Gradio interface definition: maps the six UI inputs onto `process` and
# renders its three outputs (EDA dict, saved plot image, metrics table).
demo = gr.Interface(
    fn=process,
    inputs=[
        gr.File(label="Upload CSV File", file_types=['.csv']),
        gr.Textbox(label="Or enter URL to CSV", placeholder="https://...", lines=1),
        gr.Radio(["classification", "regression"], label="Select Task Type"),
        gr.Textbox(label="Target Column Name"),
        gr.Textbox(label="Feature for X-Axis (for visualization)"),
        gr.Textbox(label="Feature for Y-Axis (optional, for scatter plot)"),
    ],
    outputs=[
        gr.JSON(label="Basic EDA"),
        gr.Image(type="filepath", label="Feature Plot"),
        gr.Dataframe(label="Model Performance")
    ],
    title="AutoML Dashboard",
    description="Upload a dataset or provide a URL. Select task type, enter target column, choose features to visualize, and evaluate models."
)


# Launch the web app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
|