import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.impute import SimpleImputer
import openpyxl  # not used directly, but required as the pandas engine for Excel files
import optuna
import joblib
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
st.set_page_config(page_title="ML Model Deployment", layout="wide")
def load_data(file):
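    """Load an uploaded CSV or Excel file into a pandas DataFrame; returns None on failure."""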
try:
if file.name.endswith('.csv'):
data = pd.read_csv(file)
        elif file.name.endswith(('.xls', '.xlsx')):
            data = pd.read_excel(file)
        else:
            st.error("Unsupported file type. Please upload a CSV or Excel file.")
            return None
        return data
except Exception as e:
st.error(f"Error loading file: {e}")
return None
def auto_process_data(data):
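    """Impute missing values (median for numeric, mode for categorical) and label-encode object columns."""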
processed_data = data.copy()
label_encoders = {}
if processed_data.isnull().sum().sum() > 0:
st.info("Automatically handling missing values...")
num_cols = processed_data.select_dtypes(include=['int64', 'float64']).columns
if len(num_cols) > 0:
num_imputer = SimpleImputer(strategy='median')
processed_data[num_cols] = num_imputer.fit_transform(processed_data[num_cols])
cat_cols = processed_data.select_dtypes(include=['object']).columns
if len(cat_cols) > 0:
for col in cat_cols:
if processed_data[col].isnull().any():
most_frequent = processed_data[col].mode()[0]
                    processed_data[col] = processed_data[col].fillna(most_frequent)  # avoids chained-assignment inplace, deprecated in newer pandas
for column in processed_data.select_dtypes(include=['object']):
label_encoders[column] = LabelEncoder()
processed_data[column] = label_encoders[column].fit_transform(processed_data[column].astype(str))
return processed_data, label_encoders
def get_model_configs():
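    """Return the candidate pipelines and their hyperparameter grids, keyed by model name."""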
models = {
'Logistic Regression': {
'pipeline': Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression())
]),
'params': {
'classifier__penalty':['l1','l2'],
'classifier__C':[0.01,0.1,1],
'classifier__max_iter': [100, 200],
'classifier__solver':['liblinear','saga']
}
},
'Support Vector Machine': {
'pipeline': Pipeline([
('scaler', StandardScaler()),
('classifier', SVC(probability=True))
]),
'params': {
'classifier__C': [0.001, 0.1, 1],
'classifier__kernel': ['linear', 'rbf', 'sigmoid'],
'classifier__gamma': ['scale', 'auto', 0.01, 0.1, 1],
'classifier__max_iter':[100,200]
}
},
'Random Forest': {
'pipeline': Pipeline([
('scaler', StandardScaler()),
('classifier', RandomForestClassifier())
]),
'params': {
'classifier__n_estimators':[100,200],
'classifier__max_depth': [None, 10, 20],
'classifier__min_samples_split': [2,5,10],
'classifier__min_samples_leaf':[1,2,4],
}
},
        'XGBoost': {  # key must match the name checked in objective(), or its tuning branch never runs
            'pipeline': Pipeline([
                ('scaler', StandardScaler()),
                # use_label_encoder is deprecated/removed in recent xgboost releases, so it is omitted here
                ('classifier', XGBClassifier(eval_metric='logloss'))
]),
'params':{
'classifier__n_estimators': [100, 200],
'classifier__learning_rate': [0.01, 0.05, 0.1],
'classifier__max_depth': [3, 5, 7],
'classifier__min_child_weight': [1, 3, 5],
'classifier__subsample': [0.8, 1.0]
}
}
}
return models
def train_model(X_train, y_train, selected_model, progress_bar=None):
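    """Tune the selected model with GridSearchCV and return the best estimator and its CV score."""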
models = get_model_configs()
model_config = models[selected_model]
with st.spinner(f"Training {selected_model}..."):
grid_search = GridSearchCV(
estimator=model_config['pipeline'],
param_grid=model_config['params'],
cv=5,
n_jobs=-1,
verbose=0,
scoring="accuracy"
)
grid_search.fit(X_train, y_train)
if progress_bar:
progress_bar.progress(1.0)
return grid_search.best_estimator_, grid_search.best_score_
def objective(trial, X_train, y_train, model_name):
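    """Optuna objective: sample hyperparameters for the given model and return its mean CV accuracy."""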
models = get_model_configs()
model_config = models[model_name]
dataset_size = len(X_train)
cv_folds = 5 if dataset_size > 1000 else (3 if dataset_size > 500 else min(2, dataset_size))
params = {}
if model_name == 'Logistic Regression':
params = {
'classifier__penalty': trial.suggest_categorical('classifier__penalty', ['l1', 'l2']),
'classifier__C': trial.suggest_float('classifier__C', 0.01, 1.0, log=True),
'classifier__solver': trial.suggest_categorical('classifier__solver', ['liblinear', 'saga']),
'classifier__max_iter': trial.suggest_int('classifier__max_iter', 100, 200)
}
elif model_name == 'Support Vector Machine':
params = {
'classifier__C': trial.suggest_float('classifier__C', 0.001, 1.0, log=True),
'classifier__kernel': trial.suggest_categorical('classifier__kernel', ['linear', 'rbf', 'sigmoid']),
'classifier__gamma': trial.suggest_categorical('classifier__gamma', ['scale', 'auto', 0.01, 0.1, 1]),
'classifier__max_iter': trial.suggest_int('classifier__max_iter', 100, 200)
}
elif model_name == 'Random Forest':
params = {
'classifier__n_estimators': trial.suggest_int('classifier__n_estimators', 100, 200),
'classifier__max_depth': trial.suggest_categorical('classifier__max_depth', [None, 10, 20]),
'classifier__min_samples_split': trial.suggest_int('classifier__min_samples_split', 2, 10),
'classifier__min_samples_leaf': trial.suggest_int('classifier__min_samples_leaf', 1, 4)
}
elif model_name == 'XGBoost':
params = {
'classifier__n_estimators': trial.suggest_int('classifier__n_estimators', 100, 300),
'classifier__learning_rate': trial.suggest_float('classifier__learning_rate', 0.01, 0.2, log=True),
'classifier__max_depth': trial.suggest_int('classifier__max_depth', 3, 10),
'classifier__min_child_weight': trial.suggest_int('classifier__min_child_weight', 1, 6)
}
    pipeline = model_config['pipeline'].set_params(**params)
    # cross_val_score clones and fits the pipeline per fold, so no separate fit is needed here
    score = cross_val_score(pipeline, X_train, y_train, cv=cv_folds, scoring="accuracy").mean()
return score
def auto_train(X_train, y_train, X_test, y_test):
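    """Tune every candidate model with Optuna, compare test accuracies, and return the best one."""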
models = get_model_configs()
results = {}
best_score = 0
best_model = None
best_model_name = None
st.write("🔄 Training models with Optuna hyperparameter tuning...")
progress_cols = st.columns(len(models))
progress_bars = {model_name: progress_cols[i].progress(0.0) for i, model_name in enumerate(models)}
for model_name in models.keys():
st.write(f"🛠 Training {model_name}...")
# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(lambda trial: objective(trial, X_train, y_train, model_name), n_trials=20)
# Retrieve best parameters and train model
best_params = study.best_params
pipeline = models[model_name]['pipeline'].set_params(**best_params)
pipeline.fit(X_train, y_train)
# Evaluate model
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
results[model_name] = {
'model': pipeline,
'cv_score': study.best_value,
'test_accuracy': test_accuracy
}
progress_bars[model_name].progress(1.0)
# Track best model
if test_accuracy > best_score:
best_score = test_accuracy
best_model = pipeline
best_model_name = model_name
# Display results
results_df = pd.DataFrame({
'Model': list(results.keys()),
'Cross-Validation Score': [results[model]['cv_score'] for model in results],
'Test Accuracy': [results[model]['test_accuracy'] for model in results]
}).sort_values('Test Accuracy', ascending=False)
st.subheader("📊 Model Performance Comparison")
st.dataframe(results_df)
st.success(f"🏆 Best model: **{best_model_name}** with accuracy: **{best_score:.2%}**")
return best_model, best_model_name
def get_classification_report(y_true, y_pred):
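    """Return sklearn's classification report as a DataFrame for display in Streamlit."""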
report_dict = classification_report(y_true, y_pred, output_dict=True)
df = pd.DataFrame(report_dict).transpose()
return df
def evaluate_models(X_train, X_test, y_train, y_test):
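    """Fit every candidate pipeline, render ROC curves, confusion matrices, and a metric comparison, and return the metrics as a DataFrame."""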
    models = get_model_configs()
    results = {}
    fig_roc = plt.figure(figsize=(10, 6))
    for name, model_config in models.items():
        # Each entry in models is a config dict; the estimator lives under 'pipeline'
        model = model_config['pipeline']
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None
        accuracy = accuracy_score(y_test, y_pred)
        # 'binary' averaging assumes a two-class target
        precision = precision_score(y_test, y_pred, average='binary')
        recall = recall_score(y_test, y_pred, average='binary')
        f1 = f1_score(y_test, y_pred, average='binary')
        roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
        results[name] = {
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1-score": f1,
            "ROC-AUC": roc_auc
        }
        if y_prob is not None:
            fpr, tpr, _ = roc_curve(y_test, y_prob)
            plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.2f})")
    plt.plot([0, 1], [0, 1], linestyle="--", color="gray")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curves")
    plt.legend()
    st.pyplot(fig_roc)  # plt.show() does not render inside Streamlit
    fig, axes = plt.subplots(2, 2, figsize=(12, 10))
    for ax, (name, model_config) in zip(axes.ravel(), models.items()):
        y_pred = model_config['pipeline'].predict(X_test)  # pipelines were fitted above
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title(f"{name} - Confusion Matrix")
        ax.set_xlabel("Predicted Label")
        ax.set_ylabel("True Label")
    plt.tight_layout()
    st.pyplot(fig)
    results_df = pd.DataFrame(results).T
    fig_bar, ax_bar = plt.subplots(figsize=(10, 6))
    results_df.plot(kind="bar", ax=ax_bar)
    ax_bar.set_title("Model Comparison")
    ax_bar.set_ylabel("Score")
    ax_bar.tick_params(axis='x', rotation=45)
    ax_bar.legend(title="Metrics")
    st.pyplot(fig_bar)
    return results_df
def main():
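    """Streamlit entry point: route between the home, upload, training, visualisation, and prediction pages."""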
st.title("🤖 Machine Learning Model Deployment")
st.sidebar.header("Navigation")
page = st.sidebar.radio("Go to", ["Home","Data Upload & Analysis", "Model Training","Visualisation", "Prediction"])
if 'data' not in st.session_state:
st.session_state.data = None
if 'processed_data' not in st.session_state:
st.session_state.processed_data = None
if 'label_encoders' not in st.session_state:
st.session_state.label_encoders = None
if 'model' not in st.session_state:
st.session_state.model = None
if 'features' not in st.session_state:
st.session_state.features = None
if 'target' not in st.session_state:
st.session_state.target = None
if 'model_name' not in st.session_state:
st.session_state.model_name = None
    if page == "Home":
st.title("🚀 AutoML: Effortless Machine Learning")
st.markdown(
"""
Welcome to **AutoML**, a powerful yet easy-to-use tool that automates the process of building and evaluating
machine learning models. Whether you're a beginner exploring data or an expert looking for quick model deployment,
AutoML simplifies the entire workflow.
"""
)
st.header("🔹 Features")
st.markdown(
"""
- **Automated Model Selection** – Let AutoML pick the best algorithm for your data.
- **Hyperparameter Tuning** – Optimize model performance without manual tweaking.
- **Data Preprocessing** – Handle missing values, scaling, encoding, and feature engineering.
- **Performance Evaluation** – Compare models with key metrics and visualizations.
- **Model Export** – Save trained models for deployment.
"""
)
st.header("🚀 Get Started")
st.markdown(
"""
1. **Upload your dataset** – Provide a CSV or Excel file with your data.
2. **Select your target variable** – Choose the column to predict.
3. **Let AutoML do the magic!** – Sit back and watch the automation work.
"""
)
st.header("📊 Visual Insights")
st.markdown(
"""
Explore interactive charts and performance metrics to make informed decisions.
Use visualizations to compare model accuracy, precision, recall, and other key statistics.
"""
)
st.success("Start automating your ML workflows now! 🎯")
        st.write('''Developed by Gourav Singh, Ankit Yadav, and Pushpansh''')
if page == "Data Upload & Analysis":
st.header("📊 Data Upload & Analysis")
uploaded_file = st.file_uploader("Upload your dataset (CSV or Excel)", type=['csv', 'xlsx', 'xls'])
if uploaded_file is not None:
st.session_state.data = load_data(uploaded_file)
if st.session_state.data is not None:
st.session_state.processed_data, st.session_state.label_encoders = auto_process_data(st.session_state.data)
st.success("Data loaded and automatically processed!")
st.subheader("Dataset Overview")
col1, col2, col3 = st.columns(3)
with col1:
st.info(f"Number of rows: {st.session_state.data.shape[0]}")
with col2:
st.info(f"Number of columns: {st.session_state.data.shape[1]}")
with col3:
missing_values = st.session_state.data.isnull().sum().sum()
st.info(f"Missing values: {missing_values} (Automatically handled)")
st.subheader("Original Data Preview")
st.dataframe(st.session_state.data.head())
st.subheader("Processed Data Preview")
st.dataframe(st.session_state.processed_data.head())
st.subheader("Statistical Description")
st.dataframe(st.session_state.processed_data.describe())
st.subheader("Correlation Heatmap")
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(st.session_state.processed_data.corr(), annot=True, cmap='coolwarm', ax=ax)
st.pyplot(fig)
elif page == "Model Training":
st.header("🎯 Auto Model Training")
if st.session_state.processed_data is None:
st.warning("Please upload and process your data first!")
return
st.subheader("Select Features and Target")
columns = st.session_state.processed_data.columns.tolist()
        st.session_state.features = st.multiselect("Select features", columns, default=columns[:-1])
        # Default the target to the last column so it does not overlap the default features
        st.session_state.target = st.selectbox("Select target variable", columns, index=len(columns) - 1)
if st.button("Auto Train Models"):
if len(st.session_state.features) > 0 and st.session_state.target:
X = st.session_state.processed_data[st.session_state.features]
y = st.session_state.processed_data[st.session_state.target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
st.session_state.model, st.session_state.model_name = auto_train(X_train, y_train, X_test, y_test)
y_pred = st.session_state.model.predict(X_test)
st.subheader("Best Model Performance")
accuracy = accuracy_score(y_test, y_pred)
st.metric("Accuracy", f"{accuracy:.2%}")
st.text("Classification Report:")
df_report = get_classification_report(y_test, y_pred)
st.dataframe(df_report)
if st.session_state.model_name == "Random Forest":
st.subheader("Feature Importance")
importance_df = pd.DataFrame({
'Feature': st.session_state.features,
'Importance': st.session_state.model.named_steps['classifier'].feature_importances_
}).sort_values('Importance', ascending=False)
fig = px.bar(importance_df, x='Feature', y='Importance',
title='Feature Importance Plot')
st.plotly_chart(fig)
model_data = {
'model': st.session_state.model,
'model_name': st.session_state.model_name,
'label_encoders': st.session_state.label_encoders,
'features': st.session_state.features,
'target': st.session_state.target
}
                joblib.dump(model_data, 'model_data.joblib')
                # Read the serialized bundle into memory rather than leaving an open file handle
                with open('model_data.joblib', 'rb') as f:
                    model_bytes = f.read()
                st.download_button(
                    label="Download trained model",
                    data=model_bytes,
                    file_name='model_data.joblib',
                    mime='application/octet-stream'
                )
    elif page == "Visualisation":
st.header("Model Visualisation")
if st.session_state.model is None:
st.warning("Please train a model first!")
return
if st.session_state.processed_data is not None and st.session_state.features and st.session_state.target:
X = st.session_state.processed_data[st.session_state.features]
y = st.session_state.processed_data[st.session_state.target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create visualization options
viz_option = st.selectbox(
"Select visualization type",
["Model Comparison", "ROC Curves", "Confusion Matrix"]
)
if viz_option == "Model Comparison":
st.subheader("Model Performance Metrics")
# Train all models to compare
models = get_model_configs()
results = {}
progress_bar = st.progress(0)
progress_text = st.empty()
for i, (name, model_config) in enumerate(models.items()):
progress_text.text(f"Training {name}...")
pipeline = model_config['pipeline']
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
y_prob = pipeline.predict_proba(X_test)[:, 1] if hasattr(pipeline, "predict_proba") else None
accuracy = accuracy_score(y_test, y_pred)
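                    # The metrics below use binary averaging and assume a two-class target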
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')
roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else None
results[name] = {
"Accuracy": accuracy,
"Precision": precision,
"Recall": recall,
"F1-score": f1,
"ROC-AUC": roc_auc
}
progress_bar.progress((i + 1) / len(models))
progress_text.empty()
results_df = pd.DataFrame(results).T
st.dataframe(results_df)
fig = px.bar(
results_df.reset_index().melt(id_vars='index', var_name='Metric', value_name='Score'),
x='index', y='Score', color='Metric',
barmode='group',
title='Model Comparison',
labels={'index': 'Model'}
)
st.plotly_chart(fig)
elif viz_option == "ROC Curves":
st.subheader("ROC Curves")
models = get_model_configs()
fig = plt.figure(figsize=(10, 6))
for name, model_config in models.items():
pipeline = model_config['pipeline']
pipeline.fit(X_train, y_train)
if hasattr(pipeline, "predict_proba"):
y_prob = pipeline.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curves')
plt.legend(loc="lower right")
st.pyplot(fig)
elif viz_option == "Confusion Matrix":
st.subheader("Confusion Matrices")
models = get_model_configs()
if len(models) > 4:
st.warning("Showing confusion matrices for the first 4 models")
model_items = list(models.items())[:4]
else:
model_items = list(models.items())
num_models = len(model_items)
cols = 2
rows = (num_models + 1) // 2
fig, axes = plt.subplots(rows, cols, figsize=(12, 10))
                axes = axes.flatten()  # with two columns, subplots always returns an array of Axes
for i, (name, model_config) in enumerate(model_items):
pipeline = model_config['pipeline']
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=axes[i])
axes[i].set_title(f"{name} - Confusion Matrix")
axes[i].set_xlabel("Predicted")
axes[i].set_ylabel("Actual")
for j in range(num_models, len(axes)):
fig.delaxes(axes[j])
plt.tight_layout()
st.pyplot(fig)
st.subheader("Current Model Performance")
best_model_pred = st.session_state.model.predict(X_test)
st.metric("Accuracy", f"{accuracy_score(y_test, best_model_pred):.2%}")
col1, col2 = st.columns(2)
with col1:
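                # Default binary averaging below assumes a two-class target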
st.metric("Precision", f"{precision_score(y_test, best_model_pred):.2%}")
st.metric("F1 Score", f"{f1_score(y_test, best_model_pred):.2%}")
with col2:
st.metric("Recall", f"{recall_score(y_test, best_model_pred):.2%}")
if hasattr(st.session_state.model, "predict_proba"):
best_proba = st.session_state.model.predict_proba(X_test)[:, 1]
st.metric("AUC", f"{roc_auc_score(y_test, best_proba):.2%}")
else:
st.warning("Please load and preprocess your dataset before running evaluation.")
elif page == "Prediction":
st.header("🎲 Make Predictions")
if st.session_state.model is None:
st.warning("Please train a model first!")
return
st.subheader("Enter Feature Values")
st.info(f"Using best model: {st.session_state.model_name}")
input_data = {}
for feature in st.session_state.features:
if feature in st.session_state.label_encoders:
options = st.session_state.label_encoders[feature].classes_
value = st.selectbox(f"Select {feature}", options)
input_data[feature] = st.session_state.label_encoders[feature].transform([value])[0]
else:
input_data[feature] = st.number_input(f"Enter value for {feature}", value=0.0)
if st.button("Predict"):
input_df = pd.DataFrame([input_data])
prediction = st.session_state.model.predict(input_df)
if st.session_state.target in st.session_state.label_encoders:
original_prediction = st.session_state.label_encoders[st.session_state.target].inverse_transform(prediction)
st.success(f"Predicted {st.session_state.target}: {original_prediction[0]}")
else:
st.success(f"Predicted {st.session_state.target}: {prediction[0]}")
proba = st.session_state.model.predict_proba(input_df)
st.subheader("Prediction Probability")
if st.session_state.target in st.session_state.label_encoders:
classes = st.session_state.label_encoders[st.session_state.target].classes_
else:
classes = st.session_state.model.classes_
proba_df = pd.DataFrame(
proba,
columns=classes
)
st.dataframe(proba_df)
if __name__ == "__main__":
main()