Atharva Thakur committed • Commit ad6eb22 • 1 Parent(s): 11e054f

Added MLtoolkit to Modules and integrated with app.py
Browse files:
- Modules/MLtoolkit.py +215 -0
- app.py +48 -1
- test.py +61 -0
Modules/MLtoolkit.py
ADDED
@@ -0,0 +1,215 @@
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from sklearn.preprocessing import LabelEncoder


class MLToolkit:
    def __init__(self, data):
        self.data = data
        self.algorithm = None
        self.algorithm_type = None
        st.subheader("MLtoolkit")

    def select_algorithm(self):
        self.algorithm = st.selectbox("Select Supervised Learning Algorithm", ("KNN", "SVM", "Decision Tree", "Naive Bayes", "Random Forest", "Linear Regression", "Logistic Regression"))

        if self.algorithm not in ("Linear Regression", "Logistic Regression", "Naive Bayes"):
            self.algorithm_type = st.selectbox("Select Algorithm Type", ("Classifier", "Regressor"))
        else:
            st.write(f"In {self.algorithm}, Classifier and Regressor don't exist separately")
            if self.algorithm == "Linear Regression":
                self.algorithm_type = "Regressor"
                st.write(f"{self.algorithm} only does Regression")
            else:
                self.algorithm_type = "Classifier"
                st.write(f"{self.algorithm} only does Classification")
        return self.algorithm, self.algorithm_type

    def one_hot_encode_categorical(self, df, threshold=0.05):
        categorical_columns = df.select_dtypes(include=['object', 'category']).columns

        # Ratio of unique values to rows; high-cardinality (ID-like) columns
        # stay above the threshold and are left unencoded.
        unique_ratio = df[categorical_columns].nunique() / len(df)

        selected_categorical_columns = unique_ratio[unique_ratio < threshold].index

        df_encoded = pd.get_dummies(df, columns=selected_categorical_columns)

        return df_encoded

    def select_features_and_target(self):
        st.write("### Select Features and Target Variable")

        # Display available columns based on the algorithm
        st.write("#### Available Columns:")

        if self.algorithm in ["Linear Regression", "Logistic Regression"]:
            numerical_columns = self.data.select_dtypes(include=[np.number]).columns
            selected_features = st.multiselect("Select Numerical Features (X)", numerical_columns)
        else:
            selected_features = st.multiselect("Select Features (X)", self.data.columns)

        if self.algorithm == "Naive Bayes":
            target_variable = st.selectbox("Select Target Variable (y)", self.data.columns)
        elif self.algorithm == "Linear Regression":
            # Linear Regression needs a numerical target.
            numerical_columns = self.data.select_dtypes(include=[np.number]).columns
            target_variable = st.selectbox("Select Target Variable (y)", numerical_columns)
        else:
            target_variable = st.selectbox("Select Target Variable (y)", self.data.columns)

        # Ensure at least one feature and one target variable is selected
        if len(selected_features) < 1 or target_variable is None:
            st.error("Please select at least one feature (X) and a target variable (y).")
            return None, None

        return self.data[selected_features], self.data[target_variable]

    def add_parameter_classifier_general(self):

        params = dict()

        if self.algorithm == 'SVM':

            c_regular = st.slider('C (Regularization)', 0.01, 10.0)
            kernel_custom = st.selectbox('Kernel', ('linear', 'poly', 'rbf', 'sigmoid'))
            params['C'] = c_regular
            params['kernel'] = kernel_custom

        elif self.algorithm == 'KNN':

            k_n = st.slider('Number of Neighbors (K)', 1, 20, key="k_n_slider")
            params['K'] = k_n
            weights_custom = st.selectbox('Weights', ('uniform', 'distance'))
            params['weights'] = weights_custom

        elif self.algorithm == 'Naive Bayes':
            st.info("This is a simple algorithm. It doesn't have parameters for hyper-tuning.")

        elif self.algorithm == 'Decision Tree':

            max_depth = st.slider('Max Depth', 2, 17)
            criterion = st.selectbox('Criterion', ('gini', 'entropy'))
            splitter = st.selectbox("Splitter", ("best", "random"))
            params['max_depth'] = max_depth
            params['criterion'] = criterion
            params['splitter'] = splitter

            try:
                random = st.text_input("Enter Random State")
                params['random_state'] = int(random)
            except ValueError:
                # Fall back to a fixed seed when the input isn't an integer.
                params['random_state'] = 4567

        elif self.algorithm == 'Random Forest':

            max_depth = st.slider('Max Depth', 2, 17)
            n_estimators = st.slider('Number of Estimators', 1, 90)
            criterion = st.selectbox('Criterion', ('gini', 'entropy', 'log_loss'))
            params['max_depth'] = max_depth
            params['n_estimators'] = n_estimators
            params['criterion'] = criterion

            try:
                random = st.text_input("Enter Random State")
                params['random_state'] = int(random)
            except ValueError:
                params['random_state'] = 4567

        else:
            # Logistic Regression
            c_regular = st.slider('C (Regularization)', 0.01, 10.0)
            params['C'] = c_regular
            # Compare against the string: bool('False') would evaluate to True.
            fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
            params['fit_intercept'] = fit_intercept == 'True'
            penalty = st.selectbox("Penalty", ('l2', None))
            params['penalty'] = penalty
            n_jobs = st.selectbox("Number of Jobs", (None, -1))
            params['n_jobs'] = n_jobs

        return params

    def add_parameter_regressor(self):
        params = dict()
        if self.algorithm == 'Decision Tree':
            max_depth = st.slider('Max Depth', 2, 17)
            criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
            splitter = st.selectbox("Splitter", ("best", "random"))
            params['max_depth'] = max_depth
            params['criterion'] = criterion
            params['splitter'] = splitter
            try:
                random = st.text_input("Enter Random State")
                params['random_state'] = int(random)
            except ValueError:
                params['random_state'] = 4567
        elif self.algorithm == 'Linear Regression':
            fit_intercept = st.selectbox("Fit Intercept", ('True', 'False'))
            params['fit_intercept'] = fit_intercept == 'True'
            n_jobs = st.selectbox("Number of Jobs", (None, -1))
            params['n_jobs'] = n_jobs
        else:
            # Random Forest
            max_depth = st.slider('Max Depth', 2, 17)
            n_estimators = st.slider('Number of Estimators', 1, 90)
            criterion = st.selectbox('Criterion', ('absolute_error', 'squared_error', 'poisson', 'friedman_mse'))
            params['max_depth'] = max_depth
            params['n_estimators'] = n_estimators
            params['criterion'] = criterion
            try:
                random = st.text_input("Enter Random State")
                params['random_state'] = int(random)
            except ValueError:
                params['random_state'] = 4567
        return params

    def model_classifier(self, params):
        if self.algorithm == 'KNN':
            return KNeighborsClassifier(n_neighbors=params['K'], weights=params['weights'])
        elif self.algorithm == 'SVM':
            return SVC(C=params['C'], kernel=params['kernel'])
        elif self.algorithm == 'Decision Tree':
            return DecisionTreeClassifier(
                criterion=params['criterion'], splitter=params['splitter'],
                max_depth=params['max_depth'], random_state=params['random_state'])
        elif self.algorithm == 'Naive Bayes':
            return GaussianNB()
        elif self.algorithm == 'Random Forest':
            return RandomForestClassifier(n_estimators=params['n_estimators'],
                                          max_depth=params['max_depth'],
                                          criterion=params['criterion'],
                                          random_state=params['random_state'])
        elif self.algorithm == 'Linear Regression':
            return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
        else:
            return LogisticRegression(fit_intercept=params['fit_intercept'],
                                      penalty=params['penalty'], C=params['C'], n_jobs=params['n_jobs'])

    def model_regressor(self, params):
        if self.algorithm == 'KNN':
            return KNeighborsRegressor(n_neighbors=params['K'], weights=params['weights'])
        elif self.algorithm == 'SVM':
            return SVR(C=params['C'], kernel=params['kernel'])
        elif self.algorithm == 'Decision Tree':
            return DecisionTreeRegressor(
                criterion=params['criterion'], splitter=params['splitter'],
                max_depth=params['max_depth'], random_state=params['random_state'])
        elif self.algorithm == 'Random Forest':
            return RandomForestRegressor(n_estimators=params['n_estimators'],
                                         max_depth=params['max_depth'],
                                         criterion=params['criterion'],
                                         random_state=params['random_state'])
        else:
            return LinearRegression(fit_intercept=params['fit_intercept'], n_jobs=params['n_jobs'])
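The most opinionated piece of MLtoolkit.py is one_hot_encode_categorical, which dummy-encodes only those categorical columns whose unique-value ratio falls below a threshold, so ID-like columns are not exploded into one dummy per row. A minimal sketch of that behaviour (not part of the commit; the DataFrame and column names here are hypothetical):

import pandas as pd

df = pd.DataFrame({
    "city": ["Pune", "Mumbai", "Pune", "Delhi"] * 25,  # 3 unique / 100 rows -> ratio 0.03
    "user_id": [f"u{i}" for i in range(100)],          # 100 unique / 100 rows -> ratio 1.0
    "amount": range(100),
})

categorical = df.select_dtypes(include=["object", "category"]).columns
unique_ratio = df[categorical].nunique() / len(df)
# With threshold=0.05 only "city" qualifies; "user_id" survives unencoded.
encoded = pd.get_dummies(df, columns=unique_ratio[unique_ratio < 0.05].index)
print(encoded.columns.tolist())  # ['user_id', 'amount', 'city_Delhi', 'city_Mumbai', 'city_Pune']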
app.py
CHANGED
@@ -5,10 +5,29 @@ from Modules.data_filter import DataFilter
from Modules.data_transformer import DataTransformer
from Modules.data_visualizer import DataVisualizer
from Modules.data_QA import DataQA
+from Modules.MLtoolkit import MLToolkit
+from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
+from sklearn.model_selection import train_test_split
import os
from streamlit_option_menu import option_menu

+#---IMPORT---
+import numpy as np
import pandas as pd
+from sklearn import datasets
+from sklearn.model_selection import train_test_split
+from sklearn.svm import SVC, SVR
+from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
+from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
+from sklearn.naive_bayes import GaussianNB
+from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
+from sklearn.decomposition import PCA
+import matplotlib.pyplot as plt
+import seaborn as sns
+import streamlit as st
+from sklearn.preprocessing import LabelEncoder

def main():
    st.title('Insights 📶')
@@ -23,7 +42,7 @@ def main():
    with st.sidebar:
        selected = option_menu(
            menu_title="Main Menu",
-            options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "Data Party"])
+            options=["Data Loader", "Exploratory Data Analysis", "Data Cleaning", "Q/A", "MLtoolkit", "Data Party"])

    # --- DATA LOADER ---
    if selected == "Data Loader":
@@ -56,6 +75,34 @@ def main():
        data_QA = DataQA(data)
        data_QA.ask_csv()

+    if selected == "MLtoolkit":
+        ml_toolkit = MLToolkit(data)
+        algorithm, algorithm_type = ml_toolkit.select_algorithm()
+        X, Y = ml_toolkit.select_features_and_target()
+
+        if (algorithm_type == "Regressor") and (algorithm in ('Decision Tree', 'Random Forest', 'Linear Regression')):
+            params = ml_toolkit.add_parameter_regressor()
+        else:
+            params = ml_toolkit.add_parameter_classifier_general()
+
+        if algorithm_type == "Regressor":
+            algo_model = ml_toolkit.model_regressor(params)
+        else:
+            algo_model = ml_toolkit.model_classifier(params)
+
+        x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)
+
+        algo_model.fit(x_train, y_train)
+
+        predict = algo_model.predict(x_test)
+
+        if algorithm != 'Linear Regression' and algorithm_type != 'Regressor':
+            st.write("Training Accuracy is:", algo_model.score(x_train, y_train) * 100)
+            st.write("Testing Accuracy is:", accuracy_score(y_test, predict) * 100)
+        else:
+            st.write("Mean Squared error is:", mean_squared_error(y_test, predict))
+            st.write("Mean Absolute error is:", mean_absolute_error(y_test, predict))
+
    # --- DATA PARTY ---
    if selected == "Data Party":
        st.write("To be continued... :)")
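The integration above is driven entirely through Streamlit widgets, but the underlying train/evaluate flow is plain scikit-learn. A minimal sketch of the same flow with the widgets stripped out, assuming sklearn's built-in iris data stands in for the uploaded CSV and fixed KNN parameters stand in for what add_parameter_classifier_general would collect:

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Load a stand-in dataset and split 80/20, mirroring train_size=0.8 above.
iris = datasets.load_iris()
x_train, x_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.8, random_state=0)

# Build the model the way model_classifier would for KNN, then fit and score.
model = KNeighborsClassifier(n_neighbors=5, weights="uniform")
model.fit(x_train, y_train)
print("Testing Accuracy is:", accuracy_score(y_test, model.predict(x_test)) * 100)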
test.py
ADDED
@@ -0,0 +1,61 @@
import streamlit as st
from Modules.data_loader import DataLoader
from Modules.data_analyzer import DataAnalyzer
from Modules.data_filter import DataFilter
from Modules.data_transformer import DataTransformer
from Modules.data_visualizer import DataVisualizer
from Modules.data_QA import DataQA
from Modules.MLtoolkit import MLToolkit
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import os
from streamlit_option_menu import option_menu

#---IMPORT---
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, mean_absolute_error
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from sklearn.preprocessing import LabelEncoder


data = pd.read_csv("data.csv")


ml_toolkit = MLToolkit(data)
algorithm, algorithm_type = ml_toolkit.select_algorithm()
X, Y = ml_toolkit.select_features_and_target()

# Decision Tree, Random Forest and Linear Regression have a dedicated
# regressor parameter set; everything else shares the general classifier set.
if (algorithm_type == "Regressor") and (algorithm in ('Decision Tree', 'Random Forest', 'Linear Regression')):
    params = ml_toolkit.add_parameter_regressor()
else:
    params = ml_toolkit.add_parameter_classifier_general()

if algorithm_type == "Regressor":
    algo_model = ml_toolkit.model_regressor(params)
else:
    algo_model = ml_toolkit.model_classifier(params)

x_train, x_test, y_train, y_test = train_test_split(X, Y, train_size=0.8)

algo_model.fit(x_train, y_train)

predict = algo_model.predict(x_test)

# Accuracy for classifiers, error metrics for regressors.
if algorithm != 'Linear Regression' and algorithm_type != 'Regressor':
    st.write("Training Accuracy is:", algo_model.score(x_train, y_train) * 100)
    st.write("Testing Accuracy is:", accuracy_score(y_test, predict) * 100)
else:
    st.write("Mean Squared error is:", mean_squared_error(y_test, predict))
    st.write("Mean Absolute error is:", mean_absolute_error(y_test, predict))
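Note that test.py calls Streamlit widgets at module level, so it is presumably meant to be exercised interactively with streamlit run test.py (with a data.csv in the working directory) rather than collected by a test runner.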