Atharva Thakur commited on
Commit
ad6eb22
1 Parent(s): 11e054f

Added MLtoolkit to Modules and integrated with app.py

Browse files
Files changed (3) hide show
  1. Modules/MLtoolkit.py +215 -0
  2. app.py +48 -1
  3. test.py +61 -0
Modules/MLtoolkit.py ADDED
@@ -0,0 +1,215 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn import datasets
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.svm import SVC, SVR
6
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
7
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
8
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
9
+ from sklearn.naive_bayes import GaussianNB
10
+ from sklearn.linear_model import LinearRegression, LogisticRegression
11
+ from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
12
+ from sklearn.decomposition import PCA
13
+ import matplotlib.pyplot as plt
14
+ import seaborn as sns
15
+ import streamlit as st
16
+ from sklearn.preprocessing import LabelEncoder
17
+
18
+
19
class MLToolkit:
    """Streamlit-driven helper for configuring classical supervised
    scikit-learn models from a pandas DataFrame.

    Each method renders Streamlit widgets; the typical flow is:

        toolkit = MLToolkit(df)
        algorithm, algorithm_type = toolkit.select_algorithm()
        X, y = toolkit.select_features_and_target()
        params = toolkit.add_parameter_regressor()   # or ..._classifier_general()
        model = toolkit.model_regressor(params)      # or model_classifier(params)
    """

    def __init__(self, data):
        """Store the DataFrame and render the section header.

        data: pandas DataFrame containing both feature and target columns.
        """
        self.data = data
        self.algorithm = None       # algorithm name, set by select_algorithm()
        self.algorithm_type = None  # "Classifier" or "Regressor"
        st.subheader("MLtoolkit")

    def select_algorithm(self):
        """Render the algorithm pickers and return (algorithm, algorithm_type).

        Linear Regression is forced to "Regressor"; Logistic Regression and
        Naive Bayes are forced to "Classifier". Every other algorithm lets
        the user choose the type.
        """
        self.algorithm = st.selectbox(
            "Select Supervised Learning Algorithm",
            ("KNN", "SVM", "Decision Tree", "Naive Bayes", "Random Forest",
             "Linear Regression", "Logistic Regression"))

        if self.algorithm not in ("Linear Regression", "Logistic Regression", "Naive Bayes"):
            self.algorithm_type = st.selectbox("Select Algorithm Type", ("Classifier", "Regressor"))
        else:
            # BUGFIX: corrected the typo "dosen't" in the user-facing message.
            st.write(f"In {self.algorithm} Classifier and Regressor doesn't exist separately")
            if self.algorithm == "Linear Regression":
                self.algorithm_type = "Regressor"
                st.write("{} only does Regression".format(self.algorithm))
            else:
                self.algorithm_type = "Classifier"
                st.write(f"{self.algorithm} only does Classification")
        return self.algorithm, self.algorithm_type

    @staticmethod
    def one_hot_encode_categorical(df, threshold=0.05):
        """One-hot encode the low-cardinality categorical columns of *df*.

        BUGFIX: the original definition omitted ``self`` (and any decorator),
        so calling it through an instance passed the instance itself as
        ``df``. The method uses no instance state, so it is now a
        ``@staticmethod``; class-level calls keep working unchanged.

        df: DataFrame to encode.
        threshold: a categorical column is encoded only when its number of
            distinct values divided by len(df) is below this ratio, so
            near-unique columns (ids, free text) are left untouched.
        Returns a new DataFrame produced by pd.get_dummies.
        """
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns
        unique_ratio = df[categorical_columns].nunique() / len(df)
        selected_categorical_columns = unique_ratio[unique_ratio < threshold].index
        df_encoded = pd.get_dummies(df, columns=selected_categorical_columns)
        return df_encoded

    def select_features_and_target(self):
        """Render feature/target pickers; return (X, y) or (None, None).

        For the purely numeric algorithms (Linear/Logistic Regression) only
        numeric columns are offered. Returns the selected feature DataFrame
        and target Series, or (None, None) with an st.error message when
        the selection is incomplete.
        """
        st.write("### Select Features and Target Variable")

        # Display available columns based on the algorithm
        st.write("#### Available Columns:")

        if self.algorithm in ["Linear Regression", "Logistic Regression"]:
            numerical_columns = self.data.select_dtypes(include=[np.number]).columns
            selected_features = st.multiselect("Select Numerical Features (X)", numerical_columns)
        else:
            selected_features = st.multiselect("Select Features (X)", self.data.columns)

        if self.algorithm == "Naive Bayes":
            target_variable = st.selectbox("Select Target Variable (y)", self.data.columns)
        elif self.algorithm == "Linear Regression":
            numerical_columns = self.data.select_dtypes(include=[np.number]).columns
            target_variable = st.selectbox("Select Target Variable (y)", numerical_columns)
        else:
            target_variable = st.selectbox("Select Target Variable (y)", self.data.columns)

        # Ensure at least one feature and one target variable is selected
        if len(selected_features) < 1 or target_variable is None:
            st.error("Please select at least one feature (X) and a target variable (y).")
            return None, None

        return self.data[selected_features], self.data[target_variable]

    def _read_random_state(self):
        """Read an integer random state from a text input.

        Falls back to 4567 when the field is empty or not an integer
        (previously a bare ``except:``; narrowed to the exceptions
        ``int()`` can actually raise).
        """
        try:
            return int(st.text_input("Enter Random State"))
        except (ValueError, TypeError):
            return 4567

    def add_parameter_classifier_general(self):
        """Render hyper-parameter widgets for the chosen algorithm and
        return them as a dict.

        Used for all classifiers and also for the SVM/KNN regressors,
        whose parameter sets are identical to their classifier variants.
        """
        params = dict()

        if self.algorithm == 'SVM':
            c_regular = st.slider('C (Regularization)', 0.01, 10.0)
            # BUGFIX: 'poly ' had a trailing space, which scikit-learn
            # rejects as an unknown kernel at fit time.
            kernel_custom = st.selectbox('Kernel', ('linear', 'poly', 'rbf', 'sigmoid'))
            params['C'] = c_regular
            params['kernel'] = kernel_custom

        elif self.algorithm == 'KNN':
            k_n = st.slider('Number of Neighbors (K)', 1, 20, key="k_n_slider")
            params['K'] = k_n
            weights_custom = st.selectbox('Weights', ('uniform', 'distance'))
            params['weights'] = weights_custom

        elif self.algorithm == 'Naive Bayes':
            st.info("This is a simple Algorithm. It doesn't have Parameters for Hyper-tuning.")

        elif self.algorithm == 'Decision Tree':
            max_depth = st.slider('Max Depth', 2, 17)
            criterion = st.selectbox('Criterion', ('gini', 'entropy'))
            splitter = st.selectbox("Splitter", ("best", "random"))
            params['max_depth'] = max_depth
            params['criterion'] = criterion
            params['splitter'] = splitter
            params['random_state'] = self._read_random_state()

        elif self.algorithm == 'Random Forest':
            max_depth = st.slider('Max Depth', 2, 17)
            n_estimators = st.slider('Number of Estimators', 1, 90)
            criterion = st.selectbox('Criterion', ('gini', 'entropy', 'log_loss'))
            params['max_depth'] = max_depth
            params['n_estimators'] = n_estimators
            params['criterion'] = criterion
            params['random_state'] = self._read_random_state()

        else:
            # Logistic Regression (fallback branch).
            c_regular = st.slider('C (Regularization)', 0.01, 10.0)
            params['C'] = c_regular
            # BUGFIX: bool("False") is True, so the original always fit an
            # intercept regardless of the user's choice; compare against
            # the literal string instead.
            fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
            params['fit_intercept'] = fit_intercept == 'True'
            penalty = st.selectbox("Penalty", ('l2', None))
            params['penalty'] = penalty
            n_jobs = st.selectbox("Number of Jobs", (None, -1))
            params['n_jobs'] = n_jobs

        return params

    def add_parameter_regressor(self):
        """Render hyper-parameter widgets for the regressor variants of
        Decision Tree, Linear Regression and Random Forest; return a dict.
        """
        params = dict()
        if self.algorithm == 'Decision Tree':
            max_depth = st.slider('Max Depth', 2, 17)
            criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
            splitter = st.selectbox("Splitter", ("best", "random"))
            params['max_depth'] = max_depth
            params['criterion'] = criterion
            params['splitter'] = splitter
            params['random_state'] = self._read_random_state()
        elif self.algorithm == 'Linear Regression':
            # BUGFIX: bool("False") is True — compare against the string.
            fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
            params['fit_intercept'] = fit_intercept == 'True'
            n_jobs = st.selectbox("Number of Jobs", (None, -1))
            params['n_jobs'] = n_jobs
        else:
            # Random Forest regressor.
            max_depth = st.slider('Max Depth', 2, 17)
            n_estimators = st.slider('Number of Estimators', 1, 90)
            criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
            params['max_depth'] = max_depth
            params['n_estimators'] = n_estimators
            params['criterion'] = criterion
            params['random_state'] = self._read_random_state()
        return params

    def model_classifier(self, params):
        """Build and return the (unfitted) estimator for self.algorithm
        using the hyper-parameters collected by the widget methods.
        """
        if self.algorithm == 'KNN':
            return KNeighborsClassifier(n_neighbors=params['K'], weights=params['weights'])
        elif self.algorithm == 'SVM':
            return SVC(C=params['C'], kernel=params['kernel'])
        elif self.algorithm == 'Decision Tree':
            # BUGFIX: max_depth was collected in the UI but never passed on.
            return DecisionTreeClassifier(
                max_depth=params['max_depth'],
                criterion=params['criterion'], splitter=params['splitter'],
                random_state=params['random_state'])
        elif self.algorithm == 'Naive Bayes':
            return GaussianNB()
        elif self.algorithm == 'Random Forest':
            return RandomForestClassifier(n_estimators=params['n_estimators'],
                                          max_depth=params['max_depth'],
                                          criterion=params['criterion'],
                                          random_state=params['random_state'])
        elif self.algorithm == 'Linear Regression':
            # Kept for backward compatibility: Linear Regression is really a
            # regressor, but callers that route it here still get a model.
            return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
        else:
            return LogisticRegression(fit_intercept=params['fit_intercept'],
                                      penalty=params['penalty'], C=params['C'],
                                      n_jobs=params['n_jobs'])

    def model_regressor(self, params):
        """Build and return the (unfitted) regressor for self.algorithm."""
        if self.algorithm == 'KNN':
            return KNeighborsRegressor(n_neighbors=params['K'], weights=params['weights'])
        elif self.algorithm == 'SVM':
            return SVR(C=params['C'], kernel=params['kernel'])
        elif self.algorithm == 'Decision Tree':
            # BUGFIX: max_depth was collected in the UI but never passed on.
            return DecisionTreeRegressor(
                max_depth=params['max_depth'],
                criterion=params['criterion'], splitter=params['splitter'],
                random_state=params['random_state'])
        elif self.algorithm == 'Random Forest':
            return RandomForestRegressor(n_estimators=params['n_estimators'],
                                         max_depth=params['max_depth'],
                                         criterion=params['criterion'],
                                         random_state=params['random_state'])
        else:
            return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
app.py CHANGED
@@ -5,10 +5,29 @@ from Modules.data_filter import DataFilter
5
  from Modules.data_transformer import DataTransformer
6
  from Modules.data_visualizer import DataVisualizer
7
  from Modules.data_QA import DataQA
 
 
 
8
  import os
9
  from streamlit_option_menu import option_menu
10
 
 
 
11
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def main():
14
  st.title('Insights 📶')
@@ -23,7 +42,7 @@ def main():
23
  with st.sidebar:
24
  selected = option_menu(
25
  menu_title="Main Menu",
26
- options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "Data Party"])
27
 
28
  # --- DATA LOADER ---
29
  if selected == "Data Loader":
@@ -56,6 +75,34 @@ def main():
56
  data_QA = DataQA(data)
57
  data_QA.ask_csv()
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  # --- DATA PARTY ---
60
  if selected == "Data Party":
61
  st.write("To be continued... :)")
 
5
  from Modules.data_transformer import DataTransformer
6
  from Modules.data_visualizer import DataVisualizer
7
  from Modules.data_QA import DataQA
8
+ from Modules.MLtoolkit import MLToolkit
9
+ from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
10
+ from sklearn.model_selection import train_test_split
11
  import os
12
  from streamlit_option_menu import option_menu
13
 
14
+ #---IMPORT---
15
+ import numpy as np
16
  import pandas as pd
17
+ from sklearn import datasets
18
+ from sklearn.model_selection import train_test_split
19
+ from sklearn.svm import SVC, SVR
20
+ from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
21
+ from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
22
+ from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
23
+ from sklearn.naive_bayes import GaussianNB
24
+ from sklearn.linear_model import LinearRegression, LogisticRegression
25
+ from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
26
+ from sklearn.decomposition import PCA
27
+ import matplotlib.pyplot as plt
28
+ import seaborn as sns
29
+ import streamlit as st
30
+ from sklearn.preprocessing import LabelEncoder
31
 
32
  def main():
33
  st.title('Insights 📶')
 
42
  with st.sidebar:
43
  selected = option_menu(
44
  menu_title="Main Menu",
45
+ options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "MLtoolkit", "Data Party"])
46
 
47
  # --- DATA LOADER ---
48
  if selected == "Data Loader":
 
75
  data_QA = DataQA(data)
76
  data_QA.ask_csv()
77
 
78
if selected == "MLtoolkit":
    # Train a model on the loaded DataFrame using the MLToolkit widgets.
    ml_toolkit = MLToolkit(data)
    algorithm, algorithm_type = ml_toolkit.select_algorithm()
    X, Y = ml_toolkit.select_features_and_target()

    # ROBUSTNESS: select_features_and_target returns (None, None) until the
    # user has picked at least one feature and a target; bail out instead of
    # crashing inside train_test_split.
    if X is not None and Y is not None:
        # BUGFIX: the last clause compared algorithm_type (which is only ever
        # "Classifier"/"Regressor") against "Linear Regression", so it could
        # never match and Linear Regression fell through to the classifier
        # parameter UI. Compare the algorithm name instead.
        if algorithm_type == "Regressor" and algorithm in ("Decision Tree", "Random Forest", "Linear Regression"):
            params = ml_toolkit.add_parameter_regressor()
        else:
            params = ml_toolkit.add_parameter_classifier_general()

        if algorithm_type == "Regressor":
            algo_model = ml_toolkit.model_regressor(params)
        else:
            algo_model = ml_toolkit.model_classifier(params)

        # 80/20 split, fit, then report the metrics matching the task type.
        x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
        algo_model.fit(x_train, y_train)
        predict = algo_model.predict(x_test)

        if algorithm != 'Linear Regression' and algorithm_type != 'Regressor':
            st.write("Training Accuracy is:", algo_model.score(x_train, y_train) * 100)
            st.write("Testing Accuracy is:", accuracy_score(y_test, predict) * 100)
        else:
            st.write("Mean Squared error is:", mean_squared_error(y_test, predict))
            st.write("Mean Absolute error is:", mean_absolute_error(y_test, predict))
106
  # --- DATA PARTY ---
107
  if selected == "Data Party":
108
  st.write("To be continued... :)")
test.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Manual smoke-test script for Modules.MLtoolkit.

Run with ``streamlit run test.py``; expects a ``data.csv`` next to it.
Mirrors the "MLtoolkit" branch of app.py's main().
"""
import streamlit as st
from Modules.data_loader import DataLoader
from Modules.data_analyzer import DataAnalyzer
from Modules.data_filter import DataFilter
from Modules.data_transformer import DataTransformer
from Modules.data_visualizer import DataVisualizer
from Modules.data_QA import DataQA
from Modules.MLtoolkit import MLToolkit
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import os
from streamlit_option_menu import option_menu

# ---IMPORT--- (duplicated imports from the original file deduplicated;
# no names were removed from scope)
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


data = pd.read_csv("data.csv")

ml_toolkit = MLToolkit(data)
algorithm, algorithm_type = ml_toolkit.select_algorithm()
X, Y = ml_toolkit.select_features_and_target()

# ROBUSTNESS: (None, None) is returned until the user completes the feature
# and target selection; skip training instead of crashing.
if X is not None and Y is not None:
    # BUGFIX: the original compared algorithm_type (only ever
    # "Classifier"/"Regressor") against "Linear Regression", which never
    # matched; compare the algorithm name instead.
    if algorithm_type == "Regressor" and algorithm in ("Decision Tree", "Random Forest", "Linear Regression"):
        params = ml_toolkit.add_parameter_regressor()
    else:
        params = ml_toolkit.add_parameter_classifier_general()

    if algorithm_type == "Regressor":
        algo_model = ml_toolkit.model_regressor(params)
    else:
        algo_model = ml_toolkit.model_classifier(params)

    # 80/20 split, fit, then report the metrics matching the task type.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
    algo_model.fit(x_train, y_train)
    predict = algo_model.predict(x_test)

    if algorithm != 'Linear Regression' and algorithm_type != 'Regressor':
        st.write("Training Accuracy is:", algo_model.score(x_train, y_train) * 100)
        st.write("Testing Accuracy is:", accuracy_score(y_test, predict) * 100)
    else:
        st.write("Mean Squared error is:", mean_squared_error(y_test, predict))
        st.write("Mean Absolute error is:", mean_absolute_error(y_test, predict))