from sklearn.compose import ColumnTransformer from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder from sklearn.model_selection import train_test_split from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier from xgboost import XGBRegressor, XGBClassifier from sklearn.svm import SVR, SVC from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier from sklearn.linear_model import ElasticNet, BayesianRidge from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, GradientBoostingClassifier, AdaBoostClassifier from sklearn.naive_bayes import GaussianNB from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis from sklearn.linear_model import Ridge, Lasso from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline as SkPipeline import streamlit as st def get_model(task_type, model_name, hyperparams): """Returns the model instance based on user selection with hyperparameters.""" models = { "regression": { # Already existing: "Linear Regression": LinearRegression, "Random Forest Regressor": RandomForestRegressor, "XGBoost Regressor": XGBRegressor, # Additional regression models: "Support Vector Regressor": SVR, "Decision Tree Regressor": DecisionTreeRegressor, "K-Nearest Neighbors Regressor": KNeighborsRegressor, "ElasticNet": ElasticNet, "Gradient Boosting Regressor": GradientBoostingRegressor, "AdaBoost Regressor": AdaBoostRegressor, "Bayesian Ridge": BayesianRidge, "Ridge Regression": Ridge, "Lasso Regression": Lasso , }, "classification": { # Already existing: "Logistic Regression": LogisticRegression, "Random Forest": RandomForestClassifier, "XGBoost": XGBClassifier, # Additional classification models: "Support Vector Classifier": SVC, "Decision Tree Classifier": DecisionTreeClassifier, "K-Nearest Neighbors Classifier": KNeighborsClassifier, "Gradient Boosting Classifier": GradientBoostingClassifier, "AdaBoost Classifier": AdaBoostClassifier, "Gaussian Naive Bayes": GaussianNB, "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis, "Linear Discriminant Analysis": LinearDiscriminantAnalysis } } if task_type in models and model_name in models[task_type]: return models[task_type][model_name](**hyperparams) # Apply hyperparameters else: raise ValueError(f"Invalid model selection: {model_name} for {task_type}") def train_model(df, target_column, task_type, selected_model_name, hyperparams): """Preprocess data, train the selected model with hyperparameters, and return the trained model.""" with st.spinner(" Training model... Please wait!"): # Get the model with hyperparameters model = get_model(task_type, selected_model_name, hyperparams) # Split features and target X = df.drop(columns=[target_column]) y = df[target_column] # Label encode target if classification (for categorical labels) label_encoder = None if task_type == "classification" and y.dtype == "object": from sklearn.preprocessing import LabelEncoder label_encoder = LabelEncoder() y = label_encoder.fit_transform(y) # Train-Test Split (80-20) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Identify numerical and categorical columns num_cols = X.select_dtypes(include=["int64", "float64"]).columns cat_cols = X.select_dtypes(include=["object", "category"]).columns # Preprocessing Pipeline # Numeric pipeline: impute missing values then scale them num_pipeline = SkPipeline([ ("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler()) ]) # Categorical pipeline: impute missing values then one-hot encode them cat_pipeline = SkPipeline([ ("imputer", SimpleImputer(strategy="most_frequent")), ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False)) ]) preprocessor = ColumnTransformer([ ("num", num_pipeline, num_cols), ("cat", cat_pipeline, cat_cols) ]) pipeline = SkPipeline([ ("preprocessor", preprocessor), ("model", model) ]) # Train Model pipeline.fit(X_train, y_train) # Store test data and metadata in session state st.session_state.X_test = X_test st.session_state.y_test = y_test st.session_state.task_type = task_type st.session_state.label_encoder = label_encoder # Store label encoder for decoding predictions # Reset test results calculation flag when a new model is trained if "test_results_calculated" in st.session_state: st.session_state.test_results_calculated = False # Clear any previous test metrics to avoid using stale data for key in ['test_metrics', 'test_y_pred', 'test_y_test', 'test_cm', 'sampling_message']: if key in st.session_state: del st.session_state[key] # Return trained model + label encoder (needed for decoding predictions if classification) if task_type == "classification": return pipeline, label_encoder else: return pipeline