Spaces: Sleeping
Atharva Thakur committed
Commit 11e054f
Parent(s): 3f3b888

First draft of MLtoolkit added

Files changed:
- Experimentation/Experiments.py +0 -24
- Experimentation/MLtoolkit.py +239 -0
Experimentation/Experiments.py
DELETED
@@ -1,24 +0,0 @@
```python
import os
import pandas as pd
import streamlit as st
import numpy as np


def categorical_to_numerical(data):
    st.write(data.head())
    st.subheader("Convert Categorical to Numerical")
    columns_to_encode = st.multiselect('Choose columns to convert',
                                       data.select_dtypes(include=object).columns)
    if st.button('Convert'):
        for col in columns_to_encode:
            # One-hot encode the selected column, append the dummies,
            # then drop the original column.
            one_hot_encoded = pd.get_dummies(data[col], prefix=col).astype(int)
            data = pd.concat([data, one_hot_encoded], axis=1)
            data.drop(col, axis=1, inplace=True)
        st.success("Converted categorical variables")
        st.write(data.head())
        st.write(data.describe())
    return data


data = pd.read_csv("data.csv")
data = categorical_to_numerical(data)
```
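For reference, a minimal sketch of the transformation the deleted helper performed, on a toy frame (the column names here are invented for illustration, not taken from the app's data):

```python
import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1, 2, 3]})

# Mirror of the loop body in categorical_to_numerical: one-hot encode a
# column, append the dummies, then drop the original column.
one_hot = pd.get_dummies(df["color"], prefix="color").astype(int)
df = pd.concat([df, one_hot], axis=1).drop("color", axis=1)
print(df)
#    size  color_blue  color_red
# 0     1           0          1
# 1     2           1          0
# 2     3           0          1
```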
Experimentation/MLtoolkit.py
ADDED
@@ -0,0 +1,239 @@
```python
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from sklearn.preprocessing import LabelEncoder

st.title("ML Algorithms on Inbuilt and Kaggle Datasets")

algorithm = st.selectbox("Select Supervised Learning Algorithm",
                         ("KNN", "SVM", "Decision Tree", "Naive Bayes",
                          "Random Forest", "Linear Regression", "Logistic Regression"))

# Only KNN, SVM, Decision Tree and Random Forest come in both a classifier
# and a regressor flavour; the remaining three are fixed to one task type.
if algorithm not in ("Linear Regression", "Logistic Regression", "Naive Bayes"):
    algorithm_type = st.selectbox("Select Algorithm Type", ("Classifier", "Regressor"))
else:
    st.write(f"{algorithm} doesn't have separate Classifier and Regressor variants")
    if algorithm == "Linear Regression":
        algorithm_type = "Regressor"
        st.write(f"{algorithm} only does Regression")
    else:
        algorithm_type = "Classifier"
        st.write(f"{algorithm} only does Classification")


data = pd.read_csv("test_data.csv")


def one_hot_encode_categorical(df, threshold=0.05):
    # Only object/category columns are candidates for encoding.
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns

    # Share of unique values per column; high-cardinality columns (IDs,
    # free text) stay as-is so the frame doesn't explode into dummies.
    unique_ratio = df[categorical_columns].nunique() / len(df)
    selected_categorical_columns = unique_ratio[unique_ratio < threshold].index

    df_encoded = pd.get_dummies(df, columns=selected_categorical_columns)
    return df_encoded


data = one_hot_encode_categorical(data, threshold=0.05)
```
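The threshold rule above encodes a column only when its unique-value count is small relative to the row count. A self-contained toy illustration (column names invented for the example):

```python
import pandas as pd

df = pd.DataFrame({
    "city": ["NY", "LA"] * 50,                 # 2 unique / 100 rows = 0.02 -> encoded
    "user_id": [f"u{i}" for i in range(100)],  # 100 unique / 100 rows = 1.0 -> left as-is
})
ratio = df.select_dtypes(include=["object"]).nunique() / len(df)
print(ratio[ratio < 0.05].index.tolist())  # ['city']
```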
```python
def select_features_and_target(df, algorithm):
    st.write("### Select Features and Target Variable")

    # Display available columns based on the algorithm.
    st.write("#### Available Columns:")

    if algorithm in ["Linear Regression", "Logistic Regression"]:
        numerical_columns = df.select_dtypes(include=[np.number]).columns
        selected_features = st.multiselect("Select Numerical Features (X)", numerical_columns)
    else:
        selected_features = st.multiselect("Select Features (X)", df.columns)

    # Linear Regression needs a numeric target; the rest accept any column.
    if algorithm == "Linear Regression":
        numerical_columns = df.select_dtypes(include=[np.number]).columns
        target_variable = st.selectbox("Select Target Variable (y)", numerical_columns)
    else:
        target_variable = st.selectbox("Select Target Variable (y)", df.columns)

    # Ensure at least one feature and a target variable are selected;
    # halt this script run instead of returning None and crashing later.
    if len(selected_features) < 1 or target_variable is None:
        st.error("Please select at least one feature (X) and a target variable (y).")
        st.stop()

    return df[selected_features], df[target_variable]


X, Y = select_features_and_target(data, algorithm)
```
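Stripped of the Streamlit widgets, the selection above reduces to plain pandas indexing; a small sketch with stand-in values for the widget results (toy data, names invented for the example):

```python
import pandas as pd

df = pd.DataFrame({"age": [22, 35], "fare": [7.25, 53.1], "survived": [0, 1]})
selected_features = ["age", "fare"]  # stand-in for the st.multiselect result
target_variable = "survived"         # stand-in for the st.selectbox result
X, Y = df[selected_features], df[target_variable]
print(X.shape, Y.shape)  # (2, 2) (2,)
```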
```python
def add_parameter_classifier_general(algorithm):

    params = dict()

    if algorithm == 'SVM':
        c_regular = st.slider('C (Regularization)', 0.01, 10.0)
        kernel_custom = st.selectbox('Kernel', ('linear', 'poly', 'rbf', 'sigmoid'))
        params['C'] = c_regular
        params['kernel'] = kernel_custom

    elif algorithm == 'KNN':
        k_n = st.slider('Number of Neighbors (K)', 1, 20, key="k_n_slider")
        params['K'] = k_n
        weights_custom = st.selectbox('Weights', ('uniform', 'distance'))
        params['weights'] = weights_custom

    elif algorithm == 'Naive Bayes':
        st.info("This is a simple algorithm. It doesn't have parameters for hyper-tuning.")

    elif algorithm == 'Decision Tree':
        max_depth = st.slider('Max Depth', 2, 17)
        criterion = st.selectbox('Criterion', ('gini', 'entropy'))
        splitter = st.selectbox("Splitter", ("best", "random"))
        params['max_depth'] = max_depth
        params['criterion'] = criterion
        params['splitter'] = splitter

        # int('') raises ValueError while the text box is empty,
        # so fall back to a fixed seed until the user types a number.
        try:
            params['random_state'] = int(st.text_input("Enter Random State"))
        except ValueError:
            params['random_state'] = 4567

    elif algorithm == 'Random Forest':
        max_depth = st.slider('Max Depth', 2, 17)
        n_estimators = st.slider('Number of Estimators', 1, 90)
        criterion = st.selectbox('Criterion', ('gini', 'entropy', 'log_loss'))
        params['max_depth'] = max_depth
        params['n_estimators'] = n_estimators
        params['criterion'] = criterion

        try:
            params['random_state'] = int(st.text_input("Enter Random State"))
        except ValueError:
            params['random_state'] = 4567

    else:  # Linear Regression or Logistic Regression
        c_regular = st.slider('C (Regularization)', 0.01, 10.0)
        params['C'] = c_regular
        # Compare against the string: bool('False') would be True.
        fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
        params['fit_intercept'] = (fit_intercept == 'True')
        penalty = st.selectbox("Penalty", ('l2', None))
        params['penalty'] = penalty
        n_jobs = st.selectbox("Number of Jobs", (None, -1))
        params['n_jobs'] = n_jobs

    return params


def add_parameter_regressor(algorithm):
    params = dict()

    if algorithm == 'Decision Tree':
        max_depth = st.slider('Max Depth', 2, 17)
        criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
        splitter = st.selectbox("Splitter", ("best", "random"))
        params['max_depth'] = max_depth
        params['criterion'] = criterion
        params['splitter'] = splitter
        try:
            params['random_state'] = int(st.text_input("Enter Random State"))
        except ValueError:
            params['random_state'] = 4567

    elif algorithm == 'Linear Regression':
        fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
        params['fit_intercept'] = (fit_intercept == 'True')
        n_jobs = st.selectbox("Number of Jobs", (None, -1))
        params['n_jobs'] = n_jobs

    else:  # Random Forest
        max_depth = st.slider('Max Depth', 2, 17)
        n_estimators = st.slider('Number of Estimators', 1, 90)
        criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
        params['max_depth'] = max_depth
        params['n_estimators'] = n_estimators
        params['criterion'] = criterion
        try:
            params['random_state'] = int(st.text_input("Enter Random State"))
        except ValueError:
            params['random_state'] = 4567

    return params


# Decision Tree, Random Forest and Linear Regression have regressor-specific
# parameters; everything else goes through the general (classifier) UI.
if algorithm_type == "Regressor" and algorithm in ('Decision Tree', 'Random Forest', 'Linear Regression'):
    params = add_parameter_regressor(algorithm)
else:
    params = add_parameter_classifier_general(algorithm)


def model_classifier(algorithm, params):
    if algorithm == 'KNN':
        return KNeighborsClassifier(n_neighbors=params['K'], weights=params['weights'])
    elif algorithm == 'SVM':
        return SVC(C=params['C'], kernel=params['kernel'])
    elif algorithm == 'Decision Tree':
        return DecisionTreeClassifier(max_depth=params['max_depth'],
                                      criterion=params['criterion'],
                                      splitter=params['splitter'],
                                      random_state=params['random_state'])
    elif algorithm == 'Naive Bayes':
        return GaussianNB()
    elif algorithm == 'Random Forest':
        return RandomForestClassifier(n_estimators=params['n_estimators'],
                                      max_depth=params['max_depth'],
                                      criterion=params['criterion'],
                                      random_state=params['random_state'])
    elif algorithm == 'Linear Regression':
        return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
    else:
        return LogisticRegression(fit_intercept=params['fit_intercept'],
                                  penalty=params['penalty'], C=params['C'],
                                  n_jobs=params['n_jobs'])


def model_regressor(algorithm, params):
    if algorithm == 'KNN':
        return KNeighborsRegressor(n_neighbors=params['K'], weights=params['weights'])
    elif algorithm == 'SVM':
        return SVR(C=params['C'], kernel=params['kernel'])
    elif algorithm == 'Decision Tree':
        return DecisionTreeRegressor(max_depth=params['max_depth'],
                                     criterion=params['criterion'],
                                     splitter=params['splitter'],
                                     random_state=params['random_state'])
    elif algorithm == 'Random Forest':
        return RandomForestRegressor(n_estimators=params['n_estimators'],
                                     max_depth=params['max_depth'],
                                     criterion=params['criterion'],
                                     random_state=params['random_state'])
    else:
        return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])


if algorithm_type == "Regressor":
    algo_model = model_regressor(algorithm, params)
else:
    algo_model = model_classifier(algorithm, params)

x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)

algo_model.fit(x_train, y_train)

predict = algo_model.predict(x_test)

# Classifiers report accuracy; regressors report error metrics.
if algorithm_type == "Classifier":
    st.write("Training Accuracy is:", algo_model.score(x_train, y_train) * 100)
    st.write("Testing Accuracy is:", accuracy_score(y_test, predict) * 100)
else:
    st.write("Mean Squared error is:", mean_squared_error(y_test, predict))
    st.write("Mean Absolute error is:", mean_absolute_error(y_test, predict))
```
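Assuming a `test_data.csv` is present in the working directory (the script reads it unconditionally), the app can be tried locally with Streamlit's standard runner: `streamlit run Experimentation/MLtoolkit.py`.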