# Streamlit-based machine-learning toolkit for scikit-learn supervised models.
# Third-party imports, grouped and alphabetized (PEP 8).
import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
class MLToolkit:
    """Streamlit front-end for configuring and instantiating scikit-learn
    supervised-learning models on a user-supplied dataset.

    Typical flow: ``select_algorithm`` -> ``select_features_and_target`` ->
    ``add_parameter_*`` -> ``model_classifier`` / ``model_regressor``.
    """

    def __init__(self, data):
        """Store the dataset and render the toolkit header.

        Parameters
        ----------
        data : pandas.DataFrame
            Dataset the user will pick features and a target from.
        """
        self.data = data
        self.algorithm = None       # algorithm name, set by select_algorithm()
        self.algorithm_type = None  # "Classifier" or "Regressor"
        st.subheader("MLtoolkit")

    def select_algorithm(self):
        """Let the user pick an algorithm and, where applicable, its type.

        Linear Regression, Logistic Regression and Naive Bayes come in only
        one flavour, so their type is forced; the rest offer a choice.

        Returns
        -------
        tuple[str, str]
            ``(algorithm_name, algorithm_type)``.
        """
        self.algorithm = st.selectbox("Select Supervised Learning Algorithm", ("KNN", "SVM", "Decision Tree", "Naive Bayes", "Random Forest", "Linear Regression", "Logistic Regression"))
        if self.algorithm not in ("Linear Regression", "Logistic Regression", "Naive Bayes"):
            self.algorithm_type = st.selectbox("Select Algorithm Type", ("Classifier", "Regressor"))
        else:
            # FIX: corrected "dosen't" typo in the user-facing message.
            st.write(f"In {self.algorithm} Classifier and Regressor doesn't exist separately")
            if self.algorithm == "Linear Regression":
                self.algorithm_type = "Regressor"
                st.write("{} only does Regression".format(self.algorithm))
            else:
                self.algorithm_type = "Classifier"
                st.write(f"{self.algorithm} only does Classification")
        return self.algorithm, self.algorithm_type

    @staticmethod
    def one_hot_encode_categorical(df, threshold=0.05):
        """One-hot encode low-cardinality categorical columns of *df*.

        FIX: was defined without ``self``/``@staticmethod``, so calling it on
        an instance would have passed the instance itself as ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame to encode.
        threshold : float
            Only columns whose unique-value ratio (nunique / len) is below
            this threshold are encoded, to avoid exploding the feature space
            on high-cardinality columns.

        Returns
        -------
        pandas.DataFrame
            Frame with the selected categorical columns dummy-encoded.
        """
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns
        unique_ratio = df[categorical_columns].nunique() / len(df)
        selected_categorical_columns = unique_ratio[unique_ratio < threshold].index
        return pd.get_dummies(df, columns=selected_categorical_columns)

    def select_features_and_target(self):
        """Prompt for numerical feature columns (X) and a target column (y).

        Regression targets are restricted to numeric columns; classification
        (and Naive Bayes) may target any column.

        Returns
        -------
        tuple
            ``(features_df, target_series)``, or ``(None, None)`` when the
            selection is incomplete.
        """
        st.write("### Select Features and Target Variable")
        # Display available columns based on the algorithm
        st.write("#### Available Columns:")
        numerical_columns = self.data.select_dtypes(include=[np.number]).columns
        selected_features = st.multiselect("Select Numerical Features (X)", numerical_columns)
        # Choosing the y variable
        if self.algorithm == "Naive Bayes":
            target_variable = st.selectbox("Select Target Variable (y)", self.data.columns)
        elif self.algorithm == "Linear Regression" or (self.algorithm_type == "Regressor" and self.algorithm not in ["Linear Regression", "Logistic Regression"]):
            target_variable = st.selectbox("Select Target Variable (y)", numerical_columns)
        else:
            target_variable = st.selectbox("Select Target Variable (y)", self.data.columns)
        # Ensure at least one feature and one target variable is selected
        if len(selected_features) < 1 or target_variable is None:
            st.error("Please select at least one feature (X) and a target variable (y).")
            return None, None
        return self.data[selected_features], self.data[target_variable]

    def _read_random_state(self, params):
        """Read an integer random state from a text box into ``params``.

        Defaults to 4567 when the box is empty or not an integer.
        FIX: replaces four copy-pasted bare ``except:`` blocks with a single
        helper catching only ``ValueError``.
        """
        try:
            params['random_state'] = int(st.text_input("Enter Random State"))
        except ValueError:
            # st.text_input defaults to "" -> int("") raises ValueError.
            params['random_state'] = 4567

    def add_parameter_classifier_general(self):
        """Render hyper-parameter widgets for the chosen algorithm.

        Returns
        -------
        dict
            Parameters consumed by :meth:`model_classifier`.
        """
        params = dict()
        if self.algorithm == 'SVM':
            params['C'] = st.slider('C (Regularization)', 0.01, 10.0)
            # FIX: 'poly ' had a trailing space, which sklearn's SVC rejects.
            params['kernel'] = st.selectbox('Kernel', ('linear', 'poly', 'rbf', 'sigmoid'))
        elif self.algorithm == 'KNN':
            params['K'] = st.slider('Number of Neighbors (K)', 1, 20, key="k_n_slider")
            params['weights'] = st.selectbox('Weights', ('uniform', 'distance'))
        elif self.algorithm == 'Naive Bayes':
            st.info("This is a simple Algorithm. It doesn't have Parameters for Hyper-tuning.")
        elif self.algorithm == 'Decision Tree':
            params['max_depth'] = st.slider('Max Depth', 2, 17)
            params['criterion'] = st.selectbox('Criterion', ('gini', 'entropy'))
            params['splitter'] = st.selectbox("Splitter", ("best", "random"))
            self._read_random_state(params)
        elif self.algorithm == 'Random Forest':
            params['max_depth'] = st.slider('Max Depth', 2, 17)
            params['n_estimators'] = st.slider('Number of Estimators', 1, 90)
            params['criterion'] = st.selectbox('Criterion', ('gini', 'entropy', 'log_loss'))
            self._read_random_state(params)
        elif self.algorithm == 'Logistic Regression':
            params['C'] = st.slider('C (Regularization)', 0.01, 10.0)
            # FIX: bool('False') is True; compare the selected string instead.
            params['fit_intercept'] = st.selectbox("Fit Intercept", ('True', 'False')) == 'True'
            params['penalty'] = st.selectbox("Penalty", ('l1', 'l2', 'elasticnet', None))
            params['solver'] = st.selectbox("Solver", ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'))
            params['max_iter'] = st.slider('Maximum Iterations', 100, 1000)
            params['n_jobs'] = st.selectbox("Number of Jobs", (None, -1))
        else:
            # Linear Regression falls through to here.
            params['C'] = st.slider('C (Regularization)', 0.01, 10.0)
            params['fit_intercept'] = st.selectbox("Fit Intercept", ('True', 'False')) == 'True'
            params['penalty'] = st.selectbox("Penalty", ('l1', 'l2', 'elasticnet', None))
            params['n_jobs'] = st.selectbox("Number of Jobs", (None, -1))
        return params

    def add_parameter_regressor(self):
        """Render hyper-parameter widgets for regression variants.

        Covers Decision Tree, Linear Regression and Random Forest; other
        regressors (KNN, SVM) share the classifier parameter widgets.

        Returns
        -------
        dict
            Parameters consumed by :meth:`model_regressor`.
        """
        params = dict()
        if self.algorithm == 'Decision Tree':
            params['max_depth'] = st.slider('Max Depth', 2, 17)
            params['criterion'] = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
            params['splitter'] = st.selectbox("Splitter", ("best", "random"))
            self._read_random_state(params)
        elif self.algorithm == 'Linear Regression':
            # FIX: bool('False') is True; compare the selected string instead.
            params['fit_intercept'] = st.selectbox("Fit Intercept", ('True', 'False')) == 'True'
            params['n_jobs'] = st.selectbox("Number of Jobs", (None, -1))
        else:
            # Random Forest regressor.
            params['max_depth'] = st.slider('Max Depth', 2, 17)
            params['n_estimators'] = st.slider('Number of Estimators', 1, 90)
            params['criterion'] = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
            self._read_random_state(params)
        return params

    def model_classifier(self, params):
        """Build the classifier (or regression fallback) for ``self.algorithm``.

        Parameters
        ----------
        params : dict
            Output of :meth:`add_parameter_classifier_general`.

        Returns
        -------
        An unfitted scikit-learn estimator.
        """
        if self.algorithm == 'KNN':
            return KNeighborsClassifier(n_neighbors=params['K'], weights=params['weights'])
        elif self.algorithm == 'SVM':
            return SVC(C=params['C'], kernel=params['kernel'])
        elif self.algorithm == 'Decision Tree':
            # FIX: max_depth was collected in params but never passed on.
            return DecisionTreeClassifier(
                criterion=params['criterion'], splitter=params['splitter'],
                max_depth=params['max_depth'],
                random_state=params['random_state'])
        elif self.algorithm == 'Naive Bayes':
            return GaussianNB()
        elif self.algorithm == 'Random Forest':
            return RandomForestClassifier(n_estimators=params['n_estimators'],
                                          max_depth=params['max_depth'],
                                          criterion=params['criterion'],
                                          random_state=params['random_state'])
        elif self.algorithm == 'Linear Regression':
            return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
        else:
            return LogisticRegression(fit_intercept=params['fit_intercept'],
                                      penalty=params['penalty'],
                                      C=params['C'],
                                      solver=params['solver'],
                                      max_iter=params['max_iter'],
                                      n_jobs=params['n_jobs'])

    def model_regressor(self, params):
        """Build the regressor for ``self.algorithm``.

        Parameters
        ----------
        params : dict
            Output of :meth:`add_parameter_regressor` (or the classifier
            widget set for KNN/SVM).

        Returns
        -------
        An unfitted scikit-learn estimator.
        """
        if self.algorithm == 'KNN':
            return KNeighborsRegressor(n_neighbors=params['K'], weights=params['weights'])
        elif self.algorithm == 'SVM':
            return SVR(C=params['C'], kernel=params['kernel'])
        elif self.algorithm == 'Decision Tree':
            # FIX: max_depth was collected in params but never passed on.
            return DecisionTreeRegressor(
                criterion=params['criterion'], splitter=params['splitter'],
                max_depth=params['max_depth'],
                random_state=params['random_state'])
        elif self.algorithm == 'Random Forest':
            return RandomForestRegressor(n_estimators=params['n_estimators'],
                                         max_depth=params['max_depth'],
                                         criterion=params['criterion'],
                                         random_state=params['random_state'])
        else:
            return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])