andrewicus committed on
Commit
f8b2bd0
•
1 Parent(s): a62d129

full updates

Browse files
imgs/ml-flow1.jpeg ADDED
imgs/ml-flow2.jpeg ADDED
imgs/ml-flow3.jpeg ADDED
imgs/ml-flow4.jpeg ADDED
pages/02 🤖 Model Prediction.py CHANGED
@@ -11,7 +11,9 @@ import matplotlib.pyplot as plt
11
  from sklearn.linear_model import LogisticRegression
12
  from sklearn.metrics import classification_report
13
  from codecarbon import EmissionsTracker
 
14
  import time
 
15
 
16
  url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
17
  st.image(url, output_format="PNG", width=300)
@@ -25,7 +27,7 @@ df['Education'] = df['Education'].astype('category').cat.codes
25
  df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
26
  df = df.drop(["Dt_Customer"], axis = 1)
27
  df = df.drop(["ID"], axis = 1)
28
- params = st.multiselect("Select Parameters", df.columns, default = ["Year_Birth"])
29
  model = st.selectbox("Select Model", ["Logistic Regression", "K-Nearest Neighbors", "Decision Tree"])
30
 
31
  if not params:
@@ -39,29 +41,24 @@ else:
39
  model_start_time = time.time()
40
  tracker = EmissionsTracker()
41
  tracker.start()
 
42
  if(model == "Logistic Regression"):
43
- logmodel = LogisticRegression()
44
- logmodel.fit(X_train,y_train)
45
- model_accuracy = logmodel.predict(X_test)
46
  elif(model == "K-Nearest Neighbors"):
47
-
48
- knn = KNeighborsClassifier()
49
- knn.fit(X_train, y_train)
50
- model_accuracy = knn.predict(X_test)
51
- else:
52
- clf = DecisionTreeClassifier(max_depth=3)
53
- clf = clf.fit(X_train,y_train)
54
- model_accuracy = clf.predict(X_test)
55
-
56
  import graphviz
57
  from sklearn.tree import export_graphviz
58
-
59
- # Assuming `clf` and `X` are defined somewhere in your code
60
-
61
  # Your code for exporting the decision tree graph
62
  feature_names = X.columns
63
  feature_cols = X.columns
64
- dot_data = export_graphviz(clf, out_file=None,
65
  feature_names=feature_cols,
66
  class_names=['0', '1'],
67
  filled=True, rounded=True,
@@ -70,12 +67,39 @@ else:
70
  # Display the graph using streamlit_graphviz
71
  st.graphviz_chart(dot_data)
72
 
 
 
 
 
 
 
 
 
 
 
 
73
  model_end_time = time.time()
74
  model_execution_time = model_end_time - model_start_time
75
 
76
-
77
  emissions = tracker.stop()
78
- print(f"Estimated emissions for training the model: {emissions:.4f} kg of CO2")
79
 
80
- st.metric(label = "Accuracy", value = str(round(metrics.accuracy_score(y_test, model_accuracy)*100, 2)) + "%")
81
- st.metric(label = "Execution time:", value = str(model_execution_time) + "s")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  from sklearn.linear_model import LogisticRegression
12
  from sklearn.metrics import classification_report
13
  from codecarbon import EmissionsTracker
14
+ from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report
15
  import time
16
+ from shapash.explainer.smart_explainer import SmartExplainer
17
 
18
  url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
19
  st.image(url, output_format="PNG", width=300)
 
27
  df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
28
  df = df.drop(["Dt_Customer"], axis = 1)
29
  df = df.drop(["ID"], axis = 1)
30
+ params = st.multiselect("Select Parameters", df.columns, default = ["AcceptedCmp5", "Recency", "AcceptedCmp3", "AcceptedCmp1", "NumWebVisitsMonth"])
31
  model = st.selectbox("Select Model", ["Logistic Regression", "K-Nearest Neighbors", "Decision Tree"])
32
 
33
  if not params:
 
41
  model_start_time = time.time()
42
  tracker = EmissionsTracker()
43
  tracker.start()
44
+
45
  if(model == "Logistic Regression"):
46
+ model = LogisticRegression()
47
+ model.fit(X_train,y_train)
 
48
  elif(model == "K-Nearest Neighbors"):
49
+ numNeighbors = st.number_input('N Neighbors', 2, 10)
50
+ model = KNeighborsClassifier(n_neighbors = numNeighbors)
51
+ model.fit(X_train,y_train)
52
+ elif(model == "Decision Tree"):
53
+ maxDepth = st.number_input('Tree Depth', 2, 6)
54
+ model = DecisionTreeClassifier(max_depth=maxDepth)
55
+ model.fit(X_train,y_train)
 
 
56
  import graphviz
57
  from sklearn.tree import export_graphviz
 
 
 
58
  # Your code for exporting the decision tree graph
59
  feature_names = X.columns
60
  feature_cols = X.columns
61
+ dot_data = export_graphviz(model, out_file=None,
62
  feature_names=feature_cols,
63
  class_names=['0', '1'],
64
  filled=True, rounded=True,
 
67
  # Display the graph using streamlit_graphviz
68
  st.graphviz_chart(dot_data)
69
 
70
+
71
+ y_pred = model.predict(X_test)
72
+ st.dataframe(
73
+ pd.DataFrame(
74
+ classification_report(y_test, y_pred, output_dict=True)
75
+ ).transpose()
76
+ )
77
+ f1 = f1_score(y_test, y_pred)
78
+ precision = precision_score(y_test, y_pred, average='binary') # Use average='binary' for binary classification
79
+ model_accuracy = metrics.accuracy_score(y_test, y_pred)
80
+
81
  model_end_time = time.time()
82
  model_execution_time = model_end_time - model_start_time
83
 
 
84
  emissions = tracker.stop()
 
85
 
86
+ st.header("Key Metrics")
87
+
88
+ col1, col2, col3 = st.columns(3)
89
+
90
+ # Metric 1: Accuracy
91
+ col1.metric(label="Accuracy", value=str(round(model_accuracy*100, 2)) + "%")
92
+
93
+ col2.metric(label="F1 Score", value = str(round(f1*100, 2)) + "%")
94
+
95
+ col3.metric(label="Precision", value = str(round(precision*100, 2)) + "%")
96
+
97
+
98
+ col21, col22 = st.columns(2)
99
+ # Metric 2: Execution time
100
+ col21.metric(label="Execution time", value=str(round(model_execution_time, 2)) + "s")
101
+
102
+ # Metric 3: CO2 Emissions
103
+ col22.metric(label="CO2 Emissions", value=str(round(emissions, 2)) + "kg")
104
+
105
+
pages/03 🧑‍💻 Explainable AI.py CHANGED
@@ -1,9 +1,88 @@
1
  import streamlit as st
2
  import pandas as pd
3
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
6
- st.image(url, output_format="PNG", width=300)
7
 
8
  st.title("Explainable AI")
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  from PIL import Image
4
+ import sklearn.metrics as sk_metrics
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.neighbors import KNeighborsClassifier
7
+ from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
8
+ from sklearn.model_selection import train_test_split # Import train_test_split function
9
+ from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
10
+ import matplotlib.pyplot as plt
11
+ from sklearn.linear_model import LogisticRegression
12
+ from sklearn.metrics import classification_report
13
+ from codecarbon import EmissionsTracker
14
+ from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report
15
+ import time
16
+ from shapash.explainer.smart_explainer import SmartExplainer
17
 
18
  url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
19
+ st.image(url, output_format="PNG", width=300)
20
 
21
  st.title("Explainable AI")
22
 
23
+ df_unclean = pd.read_csv("ifood-data.csv")
24
+ df = df_unclean.dropna()
25
+ df = df[df["Year_Birth"] > 1940]
26
+ df['Education'] = df['Education'].astype('category').cat.codes
27
+ df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
28
+ df = df.drop(["Dt_Customer"], axis = 1)
29
+ df = df.drop(["ID"], axis = 1)
30
+
31
+ params = df.drop('Response', axis = 1).columns
32
+
33
+ X = df.drop(labels = ['Response'], axis = 1)
34
+ X = df[params]
35
+ y = df["Response"]
36
+ X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
37
+ model_start_time = time.time()
38
+ tracker = EmissionsTracker()
39
+ tracker.start()
40
+
41
+ model = DecisionTreeClassifier(max_depth=6)
42
+ model.fit(X_train,y_train)
43
+ import graphviz
44
+ from sklearn.tree import export_graphviz
45
+ # Your code for exporting the decision tree graph
46
+ feature_names = X.columns
47
+ feature_cols = X.columns
48
+ dot_data = export_graphviz(model, out_file=None,
49
+ feature_names=feature_cols,
50
+ class_names=['0', '1'],
51
+ filled=True, rounded=True,
52
+ special_characters=True)
53
+
54
+ # Display the graph using streamlit_graphviz
55
+ st.graphviz_chart(dot_data)
56
+
57
+ y_pred = model.predict(X_test)
58
+ # st.dataframe(
59
+ # pd.DataFrame(
60
+ # classification_report(y_test, y_pred, output_dict=True)
61
+ # ).transpose()
62
+ # )
63
+ f1 = f1_score(y_test, y_pred)
64
+ precision = precision_score(y_test, y_pred, average='binary') # Use average='binary' for binary classification
65
+ model_accuracy = metrics.accuracy_score(y_test, y_pred)
66
+
67
+ model_end_time = time.time()
68
+ model_execution_time = model_end_time - model_start_time
69
+
70
+ emissions = tracker.stop()
71
+
72
+ # Compile SmartExplainer
73
+ xpl = SmartExplainer(model)
74
+ y_pred = pd.Series(y_pred)
75
+ X_test = X_test.reset_index(drop=True)
76
+ xpl.compile(x=X_test, y_pred=y_pred)
77
+
78
+
79
+ st.plotly_chart(xpl.plot.features_importance(), use_container_width = True)
80
+
81
+ import random
82
+ subset = random.choices(X_test.index, k =50)
83
+ st.plotly_chart(xpl.plot.features_importance(selection=subset), use_container_width = True)
84
+
85
+ paramChoice = st.selectbox("Select Parameter", params)
86
+
87
+ st.plotly_chart(xpl.plot.contribution_plot(paramChoice), use_container_width = True)
88
+
pages/04 🦦 MLflow.py CHANGED
@@ -1,245 +1,27 @@
1
-
2
  import pandas as pd
3
- import seaborn as sn
4
- # Commented out IPython magic to ensure Python compatibility.
 
 
5
  from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
6
  from sklearn.model_selection import train_test_split # Import train_test_split function
7
  from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
8
  import matplotlib.pyplot as plt
9
  from sklearn.linear_model import LogisticRegression
10
  from sklearn.metrics import classification_report
11
- # %matplotlib inline
12
-
13
- df = pd.read_csv("ifood-data.csv")
14
-
15
- df.head()
16
-
17
- df['Education'] = df['Education'].astype('category').cat.codes
18
- df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
19
-
20
- df = df.drop(["Dt_Customer"], axis = 1)
21
- df = df.drop(["ID"], axis = 1)
22
-
23
- df = df.dropna()
24
-
25
- df.head()
26
-
27
- plt.figure(figsize=(16, 10))
28
- sns.heatmap(df.corr(), annot=True)
29
- plt.show()
30
-
31
- X = df.drop(labels = ['Response'], axis = 1)
32
- y = df["Response"]
33
- X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
34
-
35
- logmodel = LogisticRegression()
36
-
37
- logmodel.fit(X_train,y_train)
38
-
39
- prediction = logmodel.predict(X_test)
40
-
41
- print(classification_report(y_test,prediction))
42
-
43
- # Create Decision Tree classifer object
44
- clf = DecisionTreeClassifier()
45
-
46
- # Train Decision Tree Classifer
47
- clf = clf.fit(X_train,y_train)
48
-
49
- #Predict the response for test dataset
50
- y_pred = clf.predict(X_test)
51
-
52
- print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
53
-
54
- feature_cols = X.columns
55
- feature_cols
56
-
57
- from sklearn.tree import export_graphviz
58
- feature_names = X.columns
59
- dot_data = export_graphviz(clf, out_file=None,
60
-
61
- feature_names=feature_cols,
62
-
63
- class_names=['0','1'],
64
-
65
- filled=True, rounded=True,
66
-
67
- special_characters=True)
68
-
69
- graph = graphviz.Source(dot_data)
70
- graph
71
-
72
- # Create Decision Tree classifer object
73
- clf = DecisionTreeClassifier(max_depth=3)
74
-
75
- # Train Decision Tree Classifer
76
- clf = clf.fit(X_train,y_train)
77
-
78
- #Predict the response for test dataset
79
- y_pred = clf.predict(X_test)
80
-
81
- # Model Accuracy, how often is the classifier correct?
82
- print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
83
-
84
- import graphviz
85
- from sklearn.tree import export_graphviz
86
- feature_names = X.columns
87
- dot_data = export_graphviz(clf, out_file=None,
88
-
89
- feature_names=feature_cols,
90
-
91
- class_names=['0','1'],
92
-
93
- filled=True, rounded=True,
94
-
95
- special_characters=True)
96
-
97
- graph = graphviz.Source(dot_data)
98
- graph
99
-
100
  from shapash.explainer.smart_explainer import SmartExplainer
101
 
102
- xpl = SmartExplainer(clf)
103
-
104
- y_pred = pd.Series(y_pred)
105
- X_test = X_test.reset_index(drop=True)
106
- xpl.compile(x=X_test, y_pred=y_pred)
107
-
108
- xpl.plot.features_importance()
109
-
110
- from sklearn.neighbors import KNeighborsClassifier
111
-
112
- knn = KNeighborsClassifier()
113
-
114
- knn.fit(X_train, y_train)
115
-
116
- results = knn.predict(X_test)
117
-
118
- print("Accuracy:",metrics.accuracy_score(y_test, results))
119
-
120
- # Import necessary libraries
121
- import numpy as np # a Python library used for working with arrays
122
- import pandas as pd # it allows us to analyze big data and make conclusions based on statistical theories
123
-
124
- from pycaret.datasets import get_data # allows you to easily access and load built-in datasets for machine learning experimentation
125
- from pycaret.classification import * # imports all the classification-related functions
126
- from sklearn.model_selection import train_test_split # This function is commonly used to split a dataset into training and testing subsets.
127
- import mlflow # MLflow is an open-source platform for managing the machine learning lifecycle
128
- from sklearn import metrics as sk_metrics # imports the metrics module from the sklearn library and use various evaluation metrics and scoring functions provided by sci)kit
129
-
130
- # Split data into training and testing sets
131
- loan_train, loan_test = train_test_split(df, test_size=0.2, random_state=42)
132
-
133
- # Initialize PyCaret setup with the training set
134
- cls1 = setup(data = loan_train, target = 'Response')
135
-
136
- # Compare all models and select top 3
137
- top3 = compare_models(include=['lr', 'knn', 'dt'], n_select=3)
138
-
139
- # Log each model into mlflow separately
140
- for i, model in enumerate(top3, 1):
141
- with mlflow.start_run(run_name = f"Model: {model}"):
142
- model_name = "model_" + str(i)
143
-
144
- # Log model
145
- mlflow.sklearn.log_model(model, model_name)
146
-
147
- # Log parameters
148
- params = model.get_params()
149
- for key, value in params.items():
150
- mlflow.log_param(key, value)
151
-
152
- # Predict on the testing set and log metrics
153
- y_pred = predict_model(model, data=loan_test.drop('Response', axis=1))
154
- y_test = loan_test['Response']
155
-
156
- # Calculate metrics
157
- accuracy = sk_metrics.accuracy_score(y_test, y_pred["prediction_label"])
158
- precision = sk_metrics.precision_score(y_test, y_pred["prediction_label"], average='weighted')
159
- recall = sk_metrics.recall_score(y_test, y_pred["prediction_label"], average='weighted')
160
- f1 = sk_metrics.f1_score(y_test, y_pred["prediction_label"], average='weighted')
161
-
162
- # Log metrics
163
- mlflow.log_metric("Accuracy", accuracy)
164
- mlflow.log_metric("Precision", precision)
165
- mlflow.log_metric("Recall", recall)
166
- mlflow.log_metric("F1 Score", f1)
167
-
168
- mlflow.end_run()
169
-
170
-
171
- # Split data into training and testing sets
172
- loan_train, loan_test = train_test_split(df, test_size=0.2, random_state=42)
173
-
174
- # Define the list of max_depth values to try
175
- max_depth_values = [2, 4, 6, 8, 10]
176
-
177
- # Loop over each max_depth value
178
- for depth in max_depth_values:
179
- with mlflow.start_run(run_name=f"Decision Tree (Max Depth: {depth})"):
180
- # Initialize and train the decision tree model
181
- model = DecisionTreeClassifier(max_depth=depth)
182
- model.fit(loan_train.drop('Response', axis=1), loan_train['Response'])
183
-
184
- # Log model parameters
185
- mlflow.log_param("max_depth", depth)
186
-
187
- # Predict on the testing set and log metrics
188
- y_pred = model.predict(loan_test.drop('Response', axis=1))
189
- y_test = loan_test['Response']
190
-
191
- # Calculate metrics
192
- accuracy = sk_metrics.accuracy_score(y_test, y_pred)
193
- precision = sk_metrics.precision_score(y_test, y_pred, average='weighted')
194
- recall = sk_metrics.recall_score(y_test, y_pred, average='weighted')
195
- f1 = sk_metrics.f1_score(y_test, y_pred, average='weighted')
196
-
197
- # Log metrics
198
- mlflow.log_metric("Accuracy", accuracy)
199
- mlflow.log_metric("Precision", precision)
200
- mlflow.log_metric("Recall", recall)
201
- mlflow.log_metric("F1 Score", f1)
202
-
203
- # Log the trained model
204
- mlflow.sklearn.log_model(model, "decision_tree_model")
205
-
206
- mlflow.end_run()
207
-
208
- from sklearn.neighbors import KNeighborsClassifier
209
-
210
- # Split data into training and testing sets
211
- loan_train, loan_test = train_test_split(df, test_size=0.2, random_state=42)
212
-
213
- # Define the list of n_neighbors values to try
214
- n_neighbors_values = [3, 5, 7, 9, 11]
215
-
216
- # Loop over each n_neighbors value
217
- for n_neighbors in n_neighbors_values:
218
- with mlflow.start_run(run_name=f"KNN (n_neighbors: {n_neighbors})"):
219
- # Initialize and train the KNN model
220
- model = KNeighborsClassifier(n_neighbors=n_neighbors)
221
- model.fit(loan_train.drop('Response', axis=1), loan_train['Response'])
222
-
223
- # Log model parameters
224
- mlflow.log_param("n_neighbors", n_neighbors)
225
-
226
- # Predict on the testing set and log metrics
227
- y_pred = model.predict(loan_test.drop('Response', axis=1))
228
- y_test = loan_test['Response']
229
-
230
- # Calculate metrics
231
- accuracy = sk_metrics.accuracy_score(y_test, y_pred)
232
- precision = sk_metrics.precision_score(y_test, y_pred, average='weighted')
233
- recall = sk_metrics.recall_score(y_test, y_pred, average='weighted')
234
- f1 = sk_metrics.f1_score(y_test, y_pred, average='weighted')
235
 
236
- # Log metrics
237
- mlflow.log_metric("Accuracy", accuracy)
238
- mlflow.log_metric("Precision", precision)
239
- mlflow.log_metric("Recall", recall)
240
- mlflow.log_metric("F1 Score", f1)
241
 
242
- # Log the trained model
243
- mlflow.sklearn.log_model(model, "knn_model")
 
 
244
 
245
- mlflow.end_run()
 
1
+ import streamlit as st
2
  import pandas as pd
3
+ from PIL import Image
4
+ import sklearn.metrics as sk_metrics
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.neighbors import KNeighborsClassifier
7
  from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
8
  from sklearn.model_selection import train_test_split # Import train_test_split function
9
  from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
10
  import matplotlib.pyplot as plt
11
  from sklearn.linear_model import LogisticRegression
12
  from sklearn.metrics import classification_report
13
+ from codecarbon import EmissionsTracker
14
+ from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report
15
+ import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  from shapash.explainer.smart_explainer import SmartExplainer
17
 
18
+ url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
19
+ st.image(url, output_format="PNG", width=300)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
+ st.title("MLflow Dashboard")
 
 
 
 
22
 
23
+ st.image('imgs/ml-flow1.jpeg', caption='MLflow Dashbaord')
24
+ st.image('imgs/ml-flow2.jpeg', caption='Comparing Models')
25
+ st.image('imgs/ml-flow3.jpeg', caption='Models sorted by Accuracy')
26
+ st.image('imgs/ml-flow4.jpeg', caption='Winning Model')
27
 
 
requirements.txt CHANGED
@@ -7,4 +7,6 @@ tensorflow
7
  matplotlib
8
  streamlit
9
  seaborn
10
- graphviz
 
 
 
7
  matplotlib
8
  streamlit
9
  seaborn
10
+ graphviz
11
+ shapash
12
+ shapash[report]