Spaces:

NYU-DS-4-Everyone
/

Final

Sleeping

App Files Files Community

andrewicus committed on May 2

Commit

f8b2bd0

•

1 Parent(s): a62d129

full updates

Browse files

Files changed (8) hide show

imgs/ml-flow1.jpeg +0 -0
imgs/ml-flow2.jpeg +0 -0
imgs/ml-flow3.jpeg +0 -0
imgs/ml-flow4.jpeg +0 -0
pages/02 🤖 Model Prediction.py +45 -21
pages/03 🧑‍💻 Explainable AI.py +80 -1
pages/04 🦦 MLflow.py +15 -233
requirements.txt +3 -1

imgs/ml-flow1.jpeg ADDED Viewed

imgs/ml-flow2.jpeg ADDED Viewed

imgs/ml-flow3.jpeg ADDED Viewed

imgs/ml-flow4.jpeg ADDED Viewed

pages/02 🤖 Model Prediction.py CHANGED Viewed

@@ -11,7 +11,9 @@ import matplotlib.pyplot as plt
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import classification_report
 from codecarbon import EmissionsTracker
 import time
 url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
 st.image(url,  output_format="PNG", width=300)
@@ -25,7 +27,7 @@ df['Education'] = df['Education'].astype('category').cat.codes
 df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
 df = df.drop(["Dt_Customer"], axis = 1)
 df = df.drop(["ID"], axis = 1)
-params = st.multiselect("Select Parameters", df.columns, default = ["Year_Birth"])
 model = st.selectbox("Select Model", ["Logistic Regression", "K-Nearest Neighbors", "Decision Tree"])
 if not params:
@@ -39,29 +41,24 @@ else:
     model_start_time = time.time()
     tracker = EmissionsTracker()
     tracker.start()
     if(model == "Logistic Regression"):
-        logmodel = LogisticRegression()
-        logmodel.fit(X_train,y_train)
-        model_accuracy = logmodel.predict(X_test)
     elif(model == "K-Nearest Neighbors"):
-        knn = KNeighborsClassifier()
-        knn.fit(X_train, y_train)
-        model_accuracy = knn.predict(X_test)
-    else:
-        clf = DecisionTreeClassifier(max_depth=3)
-        clf = clf.fit(X_train,y_train)
-        model_accuracy = clf.predict(X_test)
         import graphviz
         from sklearn.tree import export_graphviz
-        # Assuming `clf` and `X` are defined somewhere in your code
         # Your code for exporting the decision tree graph
         feature_names = X.columns
         feature_cols = X.columns
-        dot_data = export_graphviz(clf, out_file=None,
                                 feature_names=feature_cols,
                                 class_names=['0', '1'],
                                 filled=True, rounded=True,
@@ -70,12 +67,39 @@ else:
         # Display the graph using streamlit_graphviz
         st.graphviz_chart(dot_data)
     model_end_time = time.time()
     model_execution_time = model_end_time - model_start_time
     emissions = tracker.stop()
-    print(f"Estimated emissions for training the model: {emissions:.4f} kg of CO2")
-    st.metric(label = "Accuracy", value = str(round(metrics.accuracy_score(y_test, model_accuracy)*100, 2)) + "%")
-    st.metric(label = "Execution time:", value = str(model_execution_time) + "s")

 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import classification_report
 from codecarbon import EmissionsTracker
+from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report
 import time
+from shapash.explainer.smart_explainer import SmartExplainer
 url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
 st.image(url,  output_format="PNG", width=300)
 df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
 df = df.drop(["Dt_Customer"], axis = 1)
 df = df.drop(["ID"], axis = 1)
+params = st.multiselect("Select Parameters", df.columns, default = ["AcceptedCmp5", "Recency", "AcceptedCmp3", "AcceptedCmp1", "NumWebVisitsMonth"])
 model = st.selectbox("Select Model", ["Logistic Regression", "K-Nearest Neighbors", "Decision Tree"])
 if not params:
     model_start_time = time.time()
     tracker = EmissionsTracker()
     tracker.start()
     if(model == "Logistic Regression"):
+        model = LogisticRegression()
+        model.fit(X_train,y_train)
     elif(model == "K-Nearest Neighbors"):
+        numNeighbors = st.number_input('N Neighbors', 2, 10)
+        model = KNeighborsClassifier(n_neighbors = numNeighbors)
+        model.fit(X_train,y_train)
+    elif(model == "Decision Tree"):
+        maxDepth = st.number_input('Tree Depth', 2, 6)
+        model = DecisionTreeClassifier(max_depth=maxDepth)
+        model.fit(X_train,y_train)
         import graphviz
         from sklearn.tree import export_graphviz
         # Your code for exporting the decision tree graph
         feature_names = X.columns
         feature_cols = X.columns
+        dot_data = export_graphviz(model, out_file=None,
                                 feature_names=feature_cols,
                                 class_names=['0', '1'],
                                 filled=True, rounded=True,
         # Display the graph using streamlit_graphviz
         st.graphviz_chart(dot_data)
+    y_pred = model.predict(X_test)
+    st.dataframe(
+    pd.DataFrame(
+        classification_report(y_test, y_pred, output_dict=True)
+    ).transpose()
+    )
+    f1 = f1_score(y_test, y_pred)
+    precision = precision_score(y_test, y_pred, average='binary')  # Use average='binary' for binary classification
+    model_accuracy = metrics.accuracy_score(y_test, y_pred)
     model_end_time = time.time()
     model_execution_time = model_end_time - model_start_time
     emissions = tracker.stop()
+    st.header("Key Metrics")
+    col1, col2, col3 = st.columns(3)
+    # Metric 1: Accuracy
+    col1.metric(label="Accuracy", value=str(round(model_accuracy*100, 2)) + "%")
+    col2.metric(label="F1 Score", value = str(round(f1*100, 2)) + "%")
+    col3.metric(label="Precision", value = str(round(precision*100, 2)) + "%")
+    col21, col22 = st.columns(2)
+    # Metric 2: Execution time
+    col21.metric(label="Execution time", value=str(round(model_execution_time, 2)) + "s")
+    # Metric 3: CO2 Emissions
+    col22.metric(label="CO2 Emissions", value=str(round(emissions, 2)) + "kg")

pages/03 🧑‍💻 Explainable AI.py CHANGED Viewed

@@ -1,9 +1,88 @@
 import streamlit as st
 import pandas as pd
 from PIL import Image
 url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
-st.image(url,  output_format="PNG", width=300)
 st.title("Explainable AI")

 import streamlit as st
 import pandas as pd
 from PIL import Image
+import sklearn.metrics as sk_metrics
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
+from sklearn.model_selection import train_test_split # Import train_test_split function
+from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
+import matplotlib.pyplot as plt
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import classification_report
+from codecarbon import EmissionsTracker
+from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report
+import time
+from shapash.explainer.smart_explainer import SmartExplainer
 url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
+st.image(url, output_format="PNG", width=300)
 st.title("Explainable AI")
+df_unclean = pd.read_csv("ifood-data.csv")
+df = df_unclean.dropna()
+df = df[df["Year_Birth"] > 1940]
+df['Education'] = df['Education'].astype('category').cat.codes
+df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
+df = df.drop(["Dt_Customer"], axis = 1)
+df = df.drop(["ID"], axis = 1)
+params = df.drop('Response', axis = 1).columns
+X = df.drop(labels = ['Response'], axis = 1)
+X = df[params]
+y = df["Response"]
+X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
+model_start_time = time.time()
+tracker = EmissionsTracker()
+tracker.start()
+model = DecisionTreeClassifier(max_depth=6)
+model.fit(X_train,y_train)
+import graphviz
+from sklearn.tree import export_graphviz
+# Your code for exporting the decision tree graph
+feature_names = X.columns
+feature_cols = X.columns
+dot_data = export_graphviz(model, out_file=None,
+                        feature_names=feature_cols,
+                        class_names=['0', '1'],
+                        filled=True, rounded=True,
+                        special_characters=True)
+# Display the graph using streamlit_graphviz
+st.graphviz_chart(dot_data)
+y_pred = model.predict(X_test)
+# st.dataframe(
+# pd.DataFrame(
+#     classification_report(y_test, y_pred, output_dict=True)
+# ).transpose()
+# )
+f1 = f1_score(y_test, y_pred)
+precision = precision_score(y_test, y_pred, average='binary')  # Use average='binary' for binary classification
+model_accuracy = metrics.accuracy_score(y_test, y_pred)
+model_end_time = time.time()
+model_execution_time = model_end_time - model_start_time
+emissions = tracker.stop()
+# Compile SmartExplainer
+xpl = SmartExplainer(model)
+y_pred = pd.Series(y_pred)
+X_test = X_test.reset_index(drop=True)
+xpl.compile(x=X_test, y_pred=y_pred)
+st.plotly_chart(xpl.plot.features_importance(), use_container_width = True)
+import random
+subset = random.choices(X_test.index, k =50)
+st.plotly_chart(xpl.plot.features_importance(selection=subset), use_container_width = True)
+paramChoice = st.selectbox("Select Parameter", params)
+st.plotly_chart(xpl.plot.contribution_plot(paramChoice), use_container_width = True)

pages/04 🦦 MLflow.py CHANGED Viewed

@@ -1,245 +1,27 @@
 import pandas as pd
-import seaborn as sn
-# Commented out IPython magic to ensure Python compatibility.
 from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
 from sklearn.model_selection import train_test_split # Import train_test_split function
 from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
 import matplotlib.pyplot as plt
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import classification_report
-# %matplotlib inline
-df = pd.read_csv("ifood-data.csv")
-df.head()
-df['Education'] = df['Education'].astype('category').cat.codes
-df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
-df = df.drop(["Dt_Customer"], axis = 1)
-df = df.drop(["ID"], axis = 1)
-df = df.dropna()
-df.head()
-plt.figure(figsize=(16, 10))
-sns.heatmap(df.corr(), annot=True)
-plt.show()
-X = df.drop(labels = ['Response'], axis = 1)
-y = df["Response"]
-X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
-logmodel = LogisticRegression()
-logmodel.fit(X_train,y_train)
-prediction = logmodel.predict(X_test)
-print(classification_report(y_test,prediction))
-# Create Decision Tree classifer object
-clf = DecisionTreeClassifier()
-# Train Decision Tree Classifer
-clf = clf.fit(X_train,y_train)
-#Predict the response for test dataset
-y_pred = clf.predict(X_test)
-print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
-feature_cols = X.columns
-feature_cols
-from sklearn.tree import export_graphviz
-feature_names = X.columns
-dot_data = export_graphviz(clf, out_file=None,
-                         feature_names=feature_cols,
-                         class_names=['0','1'],
-                         filled=True, rounded=True,
-                         special_characters=True)
-graph = graphviz.Source(dot_data)
-graph
-# Create Decision Tree classifer object
-clf = DecisionTreeClassifier(max_depth=3)
-# Train Decision Tree Classifer
-clf = clf.fit(X_train,y_train)
-#Predict the response for test dataset
-y_pred = clf.predict(X_test)
-# Model Accuracy, how often is the classifier correct?
-print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
-import graphviz
-from sklearn.tree import export_graphviz
-feature_names = X.columns
-dot_data = export_graphviz(clf, out_file=None,
-                         feature_names=feature_cols,
-                         class_names=['0','1'],
-                         filled=True, rounded=True,
-                         special_characters=True)
-graph = graphviz.Source(dot_data)
-graph
 from shapash.explainer.smart_explainer import SmartExplainer
-xpl = SmartExplainer(clf)
-y_pred = pd.Series(y_pred)
-X_test = X_test.reset_index(drop=True)
-xpl.compile(x=X_test, y_pred=y_pred)
-xpl.plot.features_importance()
-from sklearn.neighbors import KNeighborsClassifier
-knn = KNeighborsClassifier()
-knn.fit(X_train, y_train)
-results = knn.predict(X_test)
-print("Accuracy:",metrics.accuracy_score(y_test, results))
-# Import necessary libraries
-import numpy as np # a Python library used for working with arrays
-import pandas as pd # it allows us to analyze big data and make conclusions based on statistical theories
-from pycaret.datasets import get_data #  allows you to easily access and load built-in datasets for machine learning experimentation
-from pycaret.classification import * #  imports all the classification-related functions
-from sklearn.model_selection import train_test_split # This function is commonly used to split a dataset into training and testing subsets.
-import mlflow # MLflow is an open-source platform for managing the machine learning lifecycle
-from sklearn import metrics as sk_metrics # imports the metrics module from the sklearn library and use various evaluation metrics and scoring functions provided by sci)kit
-# Split data into training and testing sets
-loan_train, loan_test = train_test_split(df, test_size=0.2, random_state=42)
-# Initialize PyCaret setup with the training set
-cls1 = setup(data = loan_train, target = 'Response')
-# Compare all models and select top 3
-top3 = compare_models(include=['lr', 'knn', 'dt'], n_select=3)
-# Log each model into mlflow separately
-for i, model in enumerate(top3, 1):
-    with mlflow.start_run(run_name = f"Model: {model}"):
-        model_name = "model_" + str(i)
-        # Log model
-        mlflow.sklearn.log_model(model, model_name)
-        # Log parameters
-        params = model.get_params()
-        for key, value in params.items():
-            mlflow.log_param(key, value)
-        # Predict on the testing set and log metrics
-        y_pred = predict_model(model, data=loan_test.drop('Response', axis=1))
-        y_test = loan_test['Response']
-        # Calculate metrics
-        accuracy = sk_metrics.accuracy_score(y_test, y_pred["prediction_label"])
-        precision = sk_metrics.precision_score(y_test, y_pred["prediction_label"], average='weighted')
-        recall = sk_metrics.recall_score(y_test, y_pred["prediction_label"], average='weighted')
-        f1 = sk_metrics.f1_score(y_test, y_pred["prediction_label"], average='weighted')
-        # Log metrics
-        mlflow.log_metric("Accuracy", accuracy)
-        mlflow.log_metric("Precision", precision)
-        mlflow.log_metric("Recall", recall)
-        mlflow.log_metric("F1 Score", f1)
-        mlflow.end_run()
-# Split data into training and testing sets
-loan_train, loan_test = train_test_split(df, test_size=0.2, random_state=42)
-# Define the list of max_depth values to try
-max_depth_values = [2, 4, 6, 8, 10]
-# Loop over each max_depth value
-for depth in max_depth_values:
-    with mlflow.start_run(run_name=f"Decision Tree (Max Depth: {depth})"):
-        # Initialize and train the decision tree model
-        model = DecisionTreeClassifier(max_depth=depth)
-        model.fit(loan_train.drop('Response', axis=1), loan_train['Response'])
-        # Log model parameters
-        mlflow.log_param("max_depth", depth)
-        # Predict on the testing set and log metrics
-        y_pred = model.predict(loan_test.drop('Response', axis=1))
-        y_test = loan_test['Response']
-        # Calculate metrics
-        accuracy = sk_metrics.accuracy_score(y_test, y_pred)
-        precision = sk_metrics.precision_score(y_test, y_pred, average='weighted')
-        recall = sk_metrics.recall_score(y_test, y_pred, average='weighted')
-        f1 = sk_metrics.f1_score(y_test, y_pred, average='weighted')
-        # Log metrics
-        mlflow.log_metric("Accuracy", accuracy)
-        mlflow.log_metric("Precision", precision)
-        mlflow.log_metric("Recall", recall)
-        mlflow.log_metric("F1 Score", f1)
-        # Log the trained model
-        mlflow.sklearn.log_model(model, "decision_tree_model")
-        mlflow.end_run()
-from sklearn.neighbors import KNeighborsClassifier
-# Split data into training and testing sets
-loan_train, loan_test = train_test_split(df, test_size=0.2, random_state=42)
-# Define the list of n_neighbors values to try
-n_neighbors_values = [3, 5, 7, 9, 11]
-# Loop over each n_neighbors value
-for n_neighbors in n_neighbors_values:
-    with mlflow.start_run(run_name=f"KNN (n_neighbors: {n_neighbors})"):
-        # Initialize and train the KNN model
-        model = KNeighborsClassifier(n_neighbors=n_neighbors)
-        model.fit(loan_train.drop('Response', axis=1), loan_train['Response'])
-        # Log model parameters
-        mlflow.log_param("n_neighbors", n_neighbors)
-        # Predict on the testing set and log metrics
-        y_pred = model.predict(loan_test.drop('Response', axis=1))
-        y_test = loan_test['Response']
-        # Calculate metrics
-        accuracy = sk_metrics.accuracy_score(y_test, y_pred)
-        precision = sk_metrics.precision_score(y_test, y_pred, average='weighted')
-        recall = sk_metrics.recall_score(y_test, y_pred, average='weighted')
-        f1 = sk_metrics.f1_score(y_test, y_pred, average='weighted')
-        # Log metrics
-        mlflow.log_metric("Accuracy", accuracy)
-        mlflow.log_metric("Precision", precision)
-        mlflow.log_metric("Recall", recall)
-        mlflow.log_metric("F1 Score", f1)
-        # Log the trained model
-        mlflow.sklearn.log_model(model, "knn_model")
-        mlflow.end_run()

+import streamlit as st
 import pandas as pd
+from PIL import Image
+import sklearn.metrics as sk_metrics
+from sklearn.model_selection import train_test_split
+from sklearn.neighbors import KNeighborsClassifier
 from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
 from sklearn.model_selection import train_test_split # Import train_test_split function
 from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
 import matplotlib.pyplot as plt
 from sklearn.linear_model import LogisticRegression
 from sklearn.metrics import classification_report
+from codecarbon import EmissionsTracker
+from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report
+import time
 from shapash.explainer.smart_explainer import SmartExplainer
+url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
+st.image(url, output_format="PNG", width=300)
+st.title("MLflow Dashboard")
+st.image('imgs/ml-flow1.jpeg', caption='MLflow Dashbaord')
+st.image('imgs/ml-flow2.jpeg', caption='Comparing Models')
+st.image('imgs/ml-flow3.jpeg', caption='Models sorted by Accuracy')
+st.image('imgs/ml-flow4.jpeg', caption='Winning Model')

requirements.txt CHANGED Viewed

@@ -7,4 +7,6 @@ tensorflow
 matplotlib
 streamlit
 seaborn
-graphviz

 matplotlib
 streamlit
 seaborn
+graphviz
+shapash
+shapash[report]