Atharva Thakur committed on
Commit
11e054f
1 Parent: 3f3b888

First draft of MLtoolkit added

Experimentation/Experiments.py DELETED
@@ -1,24 +0,0 @@
- import os
- import pandas as pd
- import streamlit as st
- import numpy as np
-
- def categorical_to_numerical(data):
-     st.write(data.head())
-     st.subheader("Convert Categorical to Numerical")
-     columns_to_encode = st.multiselect('Choose columns to convert', data.select_dtypes(include=object).columns)
-     if st.button('Convert'):
-         for col in columns_to_encode:
-             one_hot_encoded = pd.get_dummies(data[col], prefix=col).astype(int)
-             data = pd.concat([data, one_hot_encoded], axis=1)
-             data.drop(col, axis=1, inplace=True)
-         # data = pd.DataFrame(one_hot_encoded)
-         st.success("Converted categorical variables")
-         # data.to_csv("data.csv", index=False)
-     st.write(data.head())
-     st.write(data.describe())
-     return data
-
- data = pd.read_csv("data.csv")
- data = categorical_to_numerical(data)
-
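For reference, the deleted helper used the standard encode-concat-drop pattern for one-hot encoding a single column. A minimal standalone version of that transformation (the "color" column is a made-up example):

import pandas as pd

df = pd.DataFrame({"color": ["red", "blue", "red"], "n": [1, 2, 3]})
one_hot = pd.get_dummies(df["color"], prefix="color").astype(int)  # columns color_blue, color_red
df = pd.concat([df, one_hot], axis=1).drop("color", axis=1)
print(df.columns.tolist())  # ['n', 'color_blue', 'color_red']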
 
Experimentation/MLtoolkit.py ADDED
@@ -0,0 +1,239 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn import datasets
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVC, SVR
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+ from sklearn.naive_bayes import GaussianNB
+ from sklearn.linear_model import LinearRegression, LogisticRegression
+ from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
+ from sklearn.decomposition import PCA
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import streamlit as st
+ from sklearn.preprocessing import LabelEncoder
+
+ st.title("ML Algorithms on Inbuilt and Kaggle Datasets")
+
+ algorithm = st.selectbox("Select Supervised Learning Algorithm", ("KNN", "SVM", "Decision Tree", "Naive Bayes", "Random Forest", "Linear Regression", "Logistic Regression"))
+
+ if algorithm not in ('Linear Regression', 'Logistic Regression', 'Naive Bayes'):
+     algorithm_type = st.selectbox("Select Algorithm Type", ("Classifier", "Regressor"))
+ else:
+     st.write(f"{algorithm} doesn't have separate Classifier and Regressor variants")
+     if algorithm == "Linear Regression":
+         algorithm_type = "Regressor"
+         st.write(f"{algorithm} only does Regression")
+     else:
+         algorithm_type = "Classifier"
+         st.write(f"{algorithm} only does Classification")
+
+
+
+ data = pd.read_csv("test_data.csv")
+
+ def one_hot_encode_categorical(df, threshold=0.05):
+     categorical_columns = df.select_dtypes(include=['object', 'category']).columns
+
+     unique_ratio = df[categorical_columns].nunique() / len(df)
+
+     selected_categorical_columns = unique_ratio[unique_ratio < threshold].index
+
+     df_encoded = pd.get_dummies(df, columns=selected_categorical_columns)
+
+     return df_encoded
+
+
+ data = one_hot_encode_categorical(data, threshold=0.05)
+
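A quick illustration of the unique-ratio heuristic above (a standalone sketch; the column names are invented): columns whose distinct-value count reaches 5% of the row count are treated as ID-like and skipped, so only low-cardinality columns get dummy-expanded.

import pandas as pd

df = pd.DataFrame({
    "city": ["Pune", "Delhi"] * 50,              # 2 unique / 100 rows -> ratio 0.02, gets encoded
    "user_id": [f"u{i}" for i in range(100)],    # 100 unique / 100 rows -> ratio 1.0, left as-is
    "amount": range(100),
})
print(one_hot_encode_categorical(df, threshold=0.05).columns.tolist())
# ['user_id', 'amount', 'city_Delhi', 'city_Pune']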
+ def select_features_and_target(df, algorithm):
+     st.write("### Select Features and Target Variable")
+
+     # Display available columns based on the algorithm
+     st.write("#### Available Columns:")
+
+     if algorithm in ["Linear Regression", "Logistic Regression"]:
+         numerical_columns = df.select_dtypes(include=[np.number]).columns
+         selected_features = st.multiselect("Select Numerical Features (X)", numerical_columns)
+     else:
+         selected_features = st.multiselect("Select Features (X)", df.columns)
+
+     if algorithm == "Naive Bayes":
+         target_variable = st.selectbox("Select Target Variable (y)", df.columns)
+     elif algorithm == "Linear Regression":
+         numerical_columns = df.select_dtypes(include=[np.number]).columns
+         target_variable = st.selectbox("Select Target Variable (y)", numerical_columns)
+     else:
+         target_variable = st.selectbox("Select Target Variable (y)", df.columns)
+
+     # Ensure at least one feature and a target variable are selected
+     if len(selected_features) < 1 or target_variable is None:
+         st.error("Please select at least one feature (X) and a target variable (y).")
+         return None, None
+
+     return df[selected_features], df[target_variable]
+
+ X, Y = select_features_and_target(data, algorithm)
+
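One caveat with the early return above: when nothing is selected yet, select_features_and_target returns (None, None), and the train_test_split call further down will raise. A minimal guard the draft could add (a suggestion, not part of the commit) is st.stop(), which halts the current Streamlit run:

X, Y = select_features_and_target(data, algorithm)
if X is None or Y is None:
    st.stop()  # wait for the user to pick features and a target before fitting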
+ def add_parameter_classifier_general(algorithm):
+
+     params = dict()
+
+     if algorithm == 'SVM':
+
+         c_regular = st.slider('C (Regularization)', 0.01, 10.0)
+         kernel_custom = st.selectbox('Kernel', ('linear', 'poly', 'rbf', 'sigmoid'))
+         params['C'] = c_regular
+         params['kernel'] = kernel_custom
+
+     elif algorithm == 'KNN':
+
+         k_n = st.slider('Number of Neighbors (K)', 1, 20, key="k_n_slider")
+         params['K'] = k_n
+         weights_custom = st.selectbox('Weights', ('uniform', 'distance'))
+         params['weights'] = weights_custom
+
+     elif algorithm == 'Naive Bayes':
+         st.info("This is a simple algorithm. It doesn't have parameters for hyper-tuning.")
+
+     elif algorithm == 'Decision Tree':
+
+         max_depth = st.slider('Max Depth', 2, 17)
+         criterion = st.selectbox('Criterion', ('gini', 'entropy'))
+         splitter = st.selectbox("Splitter", ("best", "random"))
+         params['max_depth'] = max_depth
+         params['criterion'] = criterion
+         params['splitter'] = splitter
+
+         try:
+             random = st.text_input("Enter Random State")
+             params['random_state'] = int(random)
+         except ValueError:
+             params['random_state'] = 4567
+
+     elif algorithm == 'Random Forest':
+
+         max_depth = st.slider('Max Depth', 2, 17)
+         n_estimators = st.slider('Number of Estimators', 1, 90)
+         criterion = st.selectbox('Criterion', ('gini', 'entropy', 'log_loss'))
+         params['max_depth'] = max_depth
+         params['n_estimators'] = n_estimators
+         params['criterion'] = criterion
+
+         try:
+             random = st.text_input("Enter Random State")
+             params['random_state'] = int(random)
+         except ValueError:
+             params['random_state'] = 4567
+
+     else:
+
+         c_regular = st.slider('C (Regularization)', 0.01, 10.0)
+         params['C'] = c_regular
+         fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
+         params['fit_intercept'] = fit_intercept == 'True'  # bool('False') would be True; compare the string instead
+         penalty = st.selectbox("Penalty", ('l2', None))
+         params['penalty'] = penalty
+         n_jobs = st.selectbox("Number of Jobs", (None, -1))
+         params['n_jobs'] = n_jobs
+
+     return params
+
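The try/except around the random-state text box works because int('') raises ValueError, falling back to 4567. For what it's worth, st.number_input expresses the same intent without the exception handling (an alternative sketch, not what the commit uses; the 4567 default mirrors the fallback above):

random_state = st.number_input("Enter Random State", value=4567, step=1)
params['random_state'] = int(random_state)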
+ def add_parameter_regressor(algorithm):
+     params = dict()
+     if algorithm == 'Decision Tree':
+         max_depth = st.slider('Max Depth', 2, 17)
+         criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
+         splitter = st.selectbox("Splitter", ("best", "random"))
+         params['max_depth'] = max_depth
+         params['criterion'] = criterion
+         params['splitter'] = splitter
+         try:
+             random = st.text_input("Enter Random State")
+             params['random_state'] = int(random)
+         except ValueError:
+             params['random_state'] = 4567
+     elif algorithm == 'Linear Regression':
+         fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
+         params['fit_intercept'] = fit_intercept == 'True'  # bool('False') would be True; compare the string instead
+         n_jobs = st.selectbox("Number of Jobs", (None, -1))
+         params['n_jobs'] = n_jobs
+     else:
+         max_depth = st.slider('Max Depth', 2, 17)
+         n_estimators = st.slider('Number of Estimators', 1, 90)
+         criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
+         params['max_depth'] = max_depth
+         params['n_estimators'] = n_estimators
+         params['criterion'] = criterion
+         try:
+             random = st.text_input("Enter Random State")
+             params['random_state'] = int(random)
+         except ValueError:
+             params['random_state'] = 4567
+     return params
+
+ if algorithm_type == "Regressor" and algorithm in ('Decision Tree', 'Random Forest', 'Linear Regression'):
+     params = add_parameter_regressor(algorithm)
+ else:
+     params = add_parameter_classifier_general(algorithm)
+
+ def model_classifier(algorithm, params):
+     if algorithm == 'KNN':
+         return KNeighborsClassifier(n_neighbors=params['K'], weights=params['weights'])
+     elif algorithm == 'SVM':
+         return SVC(C=params['C'], kernel=params['kernel'])
+     elif algorithm == 'Decision Tree':
+         return DecisionTreeClassifier(max_depth=params['max_depth'],
+                                       criterion=params['criterion'], splitter=params['splitter'],
+                                       random_state=params['random_state'])
+     elif algorithm == 'Naive Bayes':
+         return GaussianNB()
+     elif algorithm == 'Random Forest':
+         return RandomForestClassifier(n_estimators=params['n_estimators'],
+                                       max_depth=params['max_depth'],
+                                       criterion=params['criterion'],
+                                       random_state=params['random_state'])
+     elif algorithm == 'Linear Regression':
+         return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
+     else:
+         return LogisticRegression(fit_intercept=params['fit_intercept'],
+                                   penalty=params['penalty'], C=params['C'], n_jobs=params['n_jobs'])
+
+ def model_regressor(algorithm, params):
+     if algorithm == 'KNN':
+         return KNeighborsRegressor(n_neighbors=params['K'], weights=params['weights'])
+     elif algorithm == 'SVM':
+         return SVR(C=params['C'], kernel=params['kernel'])
+     elif algorithm == 'Decision Tree':
+         return DecisionTreeRegressor(max_depth=params['max_depth'],
+                                      criterion=params['criterion'], splitter=params['splitter'],
+                                      random_state=params['random_state'])
+     elif algorithm == 'Random Forest':
+         return RandomForestRegressor(n_estimators=params['n_estimators'],
+                                      max_depth=params['max_depth'],
+                                      criterion=params['criterion'],
+                                      random_state=params['random_state'])
+     else:
+         return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
+
+
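As a quick sanity check of the two factory functions (a standalone sketch with made-up parameter values):

params = {'K': 5, 'weights': 'distance'}
print(model_classifier('KNN', params))  # KNeighborsClassifier(n_neighbors=5, weights='distance')
print(model_regressor('KNN', params))   # KNeighborsRegressor(n_neighbors=5, weights='distance')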
+ if algorithm_type == "Regressor":
+     algo_model = model_regressor(algorithm, params)
+ else:
+     algo_model = model_classifier(algorithm, params)
+
+ x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
+
+ algo_model.fit(x_train, y_train)
+
+ predict = algo_model.predict(x_test)
+
+ if algorithm_type == "Classifier":
+     st.write("Training Accuracy is:", algo_model.score(x_train, y_train) * 100)
+     st.write("Testing Accuracy is:", accuracy_score(y_test, predict) * 100)
+ else:
+     st.write("Mean Squared Error is:", mean_squared_error(y_test, predict))
+     st.write("Mean Absolute Error is:", mean_absolute_error(y_test, predict))